library(car)
## Loading required package: carData
gym <- read.csv("C:/Users/Valen/Downloads/gym_members_exercise_tracking.csv", header = TRUE)
head(gym)
## Age Gender Weight..kg. Height..m. Max_BPM Avg_BPM Resting_BPM
## 1 56 Male 88.3 1.71 180 157 60
## 2 46 Female 74.9 1.53 179 151 66
## 3 32 Female 68.1 1.66 167 122 54
## 4 25 Male 53.2 1.70 190 164 56
## 5 38 Male 46.1 1.79 188 158 68
## 6 56 Female 58.0 1.68 168 156 74
## Session_Duration..hours. Calories_Burned Workout_Type Fat_Percentage
## 1 1.69 1313 Yoga 12.6
## 2 1.30 883 HIIT 33.9
## 3 1.11 677 Cardio 33.4
## 4 0.59 532 Strength 28.8
## 5 0.64 556 Strength 29.2
## 6 1.59 1116 HIIT 15.5
## Water_Intake..liters. Workout_Frequency..days.week. Experience_Level BMI
## 1 3.5 4 3 30.20
## 2 2.1 4 2 32.00
## 3 2.3 4 2 24.71
## 4 2.1 3 1 18.41
## 5 2.8 3 1 14.39
## 6 2.7 5 3 20.55
str(gym)
## 'data.frame': 973 obs. of 15 variables:
## $ Age : int 56 46 32 25 38 56 36 40 28 28 ...
## $ Gender : chr "Male" "Female" "Female" "Male" ...
## $ Weight..kg. : num 88.3 74.9 68.1 53.2 46.1 ...
## $ Height..m. : num 1.71 1.53 1.66 1.7 1.79 1.68 1.72 1.51 1.94 1.84 ...
## $ Max_BPM : int 180 179 167 190 188 168 174 189 185 169 ...
## $ Avg_BPM : int 157 151 122 164 158 156 169 141 127 136 ...
## $ Resting_BPM : int 60 66 54 56 68 74 73 64 52 64 ...
## $ Session_Duration..hours. : num 1.69 1.3 1.11 0.59 0.64 1.59 1.49 1.27 1.03 1.08 ...
## $ Calories_Burned : num 1313 883 677 532 556 ...
## $ Workout_Type : chr "Yoga" "HIIT" "Cardio" "Strength" ...
## $ Fat_Percentage : num 12.6 33.9 33.4 28.8 29.2 15.5 21.3 30.6 28.9 29.7 ...
## $ Water_Intake..liters. : num 3.5 2.1 2.3 2.1 2.8 2.7 2.3 1.9 2.6 2.7 ...
## $ Workout_Frequency..days.week.: int 4 4 4 3 3 5 3 3 4 3 ...
## $ Experience_Level : int 3 2 2 1 1 3 2 2 2 1 ...
## $ BMI : num 30.2 32 24.7 18.4 14.4 ...
colSums(is.na(gym))
## Age Gender
## 0 0
## Weight..kg. Height..m.
## 0 0
## Max_BPM Avg_BPM
## 0 0
## Resting_BPM Session_Duration..hours.
## 0 0
## Calories_Burned Workout_Type
## 0 0
## Fat_Percentage Water_Intake..liters.
## 0 0
## Workout_Frequency..days.week. Experience_Level
## 0 0
## BMI
## 0
Since there are no missing values in this dataset, we can proceed directly with the analysis and evaluation of each variable, as well as examining the relationships between them.
plot(gym)
The matrix plot represents correlations, but are a little too small to interpret. As such, we continue with further analysis.
hist(gym$`Age`, main="Histogram of Weight", xlab="Age")
hist(gym$`Weight..kg.`, main="Histogram of Weight", xlab="Weight (kg)")
hist(gym$`Height..m.`, main="Histogram of Height", xlab="Height (m)")
hist(gym$`Max_BPM`, main="Histogram of Max_BPM", xlab="Max_BPM")
hist(gym$`Avg_BPM`, main="Histogram of Avg_BPM", xlab="Avg_BPM")
hist(gym$`Resting_BPM`, main="Histogram of Resting_BPM", xlab="Resting_BPM")
hist(gym$`Session_Duration..hours.`, main="Histogram of Session_Duration", xlab="Session_Duration (hours)")
hist(gym$`Calories_Burned`, main="Histogram of Calories_Burned", xlab="Calories_Burned")
hist(gym$`Fat_Percentage`, main="Histogram of Fat_Percentage", xlab="Fat_Percentage")
hist(gym$`Water_Intake..liters.`, main="Histogram of Water_Intake (liters)", xlab="Water_Intake (liters)")
hist(gym$`Workout_Frequency..days.week.`, main="Histogram of Workout_Frequency (days/week)", xlab="Workout_Frequency (days/week)")
hist(gym$`Experience_Level`, main="Histogram of Experience_Level", xlab="Experience_Level")
hist(gym$`BMI`, main="Histogram of BMI", xlab="BMI")
General trends of each column, but they are not really helpful on their own.
#### Workout_Type
table(gym$`Workout_Type`)
##
## Cardio HIIT Strength Yoga
## 255 221 258 239
# Summary of all categorical variables in the dataset
sapply(gym, function(x) if(is.factor(x)) table(x) else NULL)
## $Age
## NULL
##
## $Gender
## NULL
##
## $Weight..kg.
## NULL
##
## $Height..m.
## NULL
##
## $Max_BPM
## NULL
##
## $Avg_BPM
## NULL
##
## $Resting_BPM
## NULL
##
## $Session_Duration..hours.
## NULL
##
## $Calories_Burned
## NULL
##
## $Workout_Type
## NULL
##
## $Fat_Percentage
## NULL
##
## $Water_Intake..liters.
## NULL
##
## $Workout_Frequency..days.week.
## NULL
##
## $Experience_Level
## NULL
##
## $BMI
## NULL
library(ggplot2)
# Bar plot using ggplot2
ggplot(gym, aes(x = `Workout_Type`)) +
geom_bar() +
labs(title = "Bar Plot of Workout_Type", x = "Category", y = "Frequency")
#### Gender
table(gym$`Gender`)
##
## Female Male
## 462 511
# Bar plot using ggplot2
ggplot(gym, aes(x = `Gender`)) +
geom_bar() +
labs(title = "Bar Plot of Gender", x = "Category", y = "Frequency")
Similarly, we analysed the general trends for each categorical variable on its own.
As such, we move on to compare the effects of different variables on each other, mainly focusing on the dependent variable as the number of calories burned.
library(ggplot2)
# Function to calculate skewness
skewness_base <- function(x) {
n <- length(x)
mean_x <- mean(x, na.rm = TRUE)
sd_x <- sd(x, na.rm = TRUE)
skewness_value <- sum((x - mean_x)^3, na.rm = TRUE) / (n * sd_x^3)
return(skewness_value)
}
# Apply skewness function to multiple predictor variables
predictors <- c("Age", "Weight..kg.", "Height..m.", "Max_BPM", "Avg_BPM",
"Resting_BPM", "Session_Duration..hours.", "Calories_Burned",
"Fat_Percentage", "Water_Intake..liters.", "Workout_Frequency..days.week.",
"Experience_Level", "BMI")
# Create a data frame to store the skewness results
skewness_results <- data.frame(Predictor = predictors, Skewness = NA)
# Loop through the predictors and calculate skewness for each
for (i in 1:length(predictors)) {
skewness_results$Skewness[i] <- skewness_base(gym[[predictors[i]]])
}
# Print the skewness results
print(skewness_results)
## Predictor Skewness
## 1 Age -0.07762405
## 2 Weight..kg. 0.77000418
## 3 Height..m. 0.33781430
## 4 Max_BPM -0.03783355
## 5 Avg_BPM 0.08609487
## 6 Resting_BPM -0.07141518
## 7 Session_Duration..hours. 0.02568165
## 8 Calories_Burned 0.27746355
## 9 Fat_Percentage -0.63326746
## 10 Water_Intake..liters. 0.07125966
## 11 Workout_Frequency..days.week. 0.14935296
## 12 Experience_Level 0.31753176
## 13 BMI 0.76129495
for (predictor in predictors) {
p <- ggplot(gym, aes_string(x = predictor)) +
geom_histogram(binwidth = 1, color = "black", fill = "skyblue", alpha = 0.7) +
ggtitle(paste("Histogram of", predictor)) +
theme_minimal() +
labs(x = predictor, y = "Frequency")
print(p)
}
Skewness Analysis and Recommendations (ie. Should we perform a transformation?):
Age: -0.0776 Near symmetric (slightly negative skew). No transformation needed.
Weight (kg): 0.7700 Moderate positive skew. A log transformation might help reduce skewness and stabilize variance.
Height (m): 0.3378 Mild positive skew. This variable is not highly skewed, so a transformation is probably not necessary unless there’s another reason (e.g., heteroskedasticity).
Max BPM: -0.0378 Near symmetric (slightly negative skew). No transformation needed.
Avg BPM: 0.0861 Mild positive skew. Like Height, this isn’t strongly skewed, and transformation isn’t necessary unless there are other modeling issues.
Resting BPM: -0.0714 Near symmetric. No transformation needed.
Session Duration (hours): 0.0257 Near symmetric. No transformation needed.
Calories Burned: 0.2775 Mild positive skew. This is not highly skewed, so a transformation (like log) is not strictly necessary but could still be useful if heteroskedasticity is suspected.
Fat Percentage: -0.6333 Moderate negative skew. A log transformation or another transformation could help to make this variable more symmetric, especially if skewness is observed in the residuals.
Water Intake (liters): 0.0713 Mild positive skew. No transformation is strictly necessary here either unless needed for variance stabilisation.
# Load necessary libraries
if (!require(car)) install.packages("car", dependencies = TRUE)
if (!require(ggcorrplot)) install.packages("ggcorrplot", dependencies = TRUE)
## Loading required package: ggcorrplot
if (!require(reshape2)) install.packages("reshape2", dependencies = TRUE)
## Loading required package: reshape2
library(car)
library(ggcorrplot)
library(reshape2)
# Scatterplot Matrix
scatterplotMatrix(~ Calories_Burned + Age + Weight..kg. + Height..m. +
Max_BPM + Avg_BPM + Resting_BPM + Session_Duration..hours. +
Fat_Percentage + Water_Intake..liters. +
Workout_Frequency..days.week. + BMI,
data = gym,
spread = FALSE, # Disable spread smooth
lty.smooth = 2, # Dashed lines for smoothing
main = "Scatter Plot Matrix for Calories Burned and Predictors")
# Calculate correlations for numeric variables
numeric_vars <- gym[, sapply(gym, is.numeric)] # Select only numeric columns
correlation_matrix <- cor(numeric_vars, use = "complete.obs") # Correlation matrix
# Display the correlation matrix in the console
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(correlation_matrix)
## Age Weight..kg. Height..m.
## Age 1.000000000 -0.036339635 -0.027837495
## Weight..kg. -0.036339635 1.000000000 0.365321203
## Height..m. -0.027837495 0.365321203 1.000000000
## Max_BPM -0.017072597 0.057061130 -0.017659884
## Avg_BPM 0.035969143 0.009717478 -0.014776288
## Resting_BPM 0.004353714 -0.032138091 -0.005089864
## Session_Duration..hours. -0.019911904 -0.013665561 -0.010205897
## Calories_Burned -0.154678760 0.095443473 0.086348051
## Fat_Percentage 0.002370051 -0.225511640 -0.235520936
## Water_Intake..liters. 0.041528359 0.394275710 0.393532902
## Workout_Frequency..days.week. 0.008055163 -0.011769328 -0.011269883
## Experience_Level -0.018675927 0.003378528 -0.010266611
## BMI -0.013691370 0.853157690 -0.159468750
## Max_BPM Avg_BPM Resting_BPM
## Age -0.0170725970 0.0359691433 0.004353714
## Weight..kg. 0.0570611305 0.0097174780 -0.032138091
## Height..m. -0.0176598843 -0.0147762881 -0.005089864
## Max_BPM 1.0000000000 -0.0397514432 0.036647481
## Avg_BPM -0.0397514432 1.0000000000 0.059635502
## Resting_BPM 0.0366474807 0.0596355022 1.000000000
## Session_Duration..hours. 0.0100509814 0.0160144382 -0.016648808
## Calories_Burned 0.0020900159 0.3396586672 0.016517951
## Fat_Percentage -0.0090557315 -0.0073016551 -0.016834389
## Water_Intake..liters. 0.0316206428 -0.0029106374 0.007725998
## Workout_Frequency..days.week. -0.0290990657 -0.0106807977 -0.007966891
## Experience_Level 0.0005448337 -0.0008881572 0.001757585
## BMI 0.0671052310 0.0216054995 -0.032542632
## Session_Duration..hours. Calories_Burned
## Age -0.019911904 -0.154678760
## Weight..kg. -0.013665561 0.095443473
## Height..m. -0.010205897 0.086348051
## Max_BPM 0.010050981 0.002090016
## Avg_BPM 0.016014438 0.339658667
## Resting_BPM -0.016648808 0.016517951
## Session_Duration..hours. 1.000000000 0.908140376
## Calories_Burned 0.908140376 1.000000000
## Fat_Percentage -0.581519771 -0.597615248
## Water_Intake..liters. 0.283410977 0.356930683
## Workout_Frequency..days.week. 0.644140366 0.576150125
## Experience_Level 0.764768119 0.694129448
## BMI -0.006492647 0.059760826
## Fat_Percentage Water_Intake..liters.
## Age 0.002370051 0.041528359
## Weight..kg. -0.225511640 0.394275710
## Height..m. -0.235520936 0.393532902
## Max_BPM -0.009055731 0.031620643
## Avg_BPM -0.007301655 -0.002910637
## Resting_BPM -0.016834389 0.007725998
## Session_Duration..hours. -0.581519771 0.283410977
## Calories_Burned -0.597615248 0.356930683
## Fat_Percentage 1.000000000 -0.588682834
## Water_Intake..liters. -0.588682834 1.000000000
## Workout_Frequency..days.week. -0.537059548 0.238562571
## Experience_Level -0.654362613 0.304103549
## BMI -0.119257760 0.213696572
## Workout_Frequency..days.week. Experience_Level
## Age 0.008055163 -0.0186759269
## Weight..kg. -0.011769328 0.0033785279
## Height..m. -0.011269883 -0.0102666112
## Max_BPM -0.029099066 0.0005448337
## Avg_BPM -0.010680798 -0.0008881572
## Resting_BPM -0.007966891 0.0017575852
## Session_Duration..hours. 0.644140366 0.7647681189
## Calories_Burned 0.576150125 0.6941294479
## Fat_Percentage -0.537059548 -0.6543626129
## Water_Intake..liters. 0.238562571 0.3041035494
## Workout_Frequency..days.week. 1.000000000 0.8370787094
## Experience_Level 0.837078709 1.0000000000
## BMI 0.001644974 0.0160310726
## BMI
## Age -0.013691370
## Weight..kg. 0.853157690
## Height..m. -0.159468750
## Max_BPM 0.067105231
## Avg_BPM 0.021605500
## Resting_BPM -0.032542632
## Session_Duration..hours. -0.006492647
## Calories_Burned 0.059760826
## Fat_Percentage -0.119257760
## Water_Intake..liters. 0.213696572
## Workout_Frequency..days.week. 0.001644974
## Experience_Level 0.016031073
## BMI 1.000000000
# Visualize the correlation matrix with heatmap
ggcorrplot(correlation_matrix,
hc.order = TRUE, # Hierarchical clustering order
type = "lower", # Show lower triangular matrix
lab = TRUE, # Add correlation coefficients
title = "Correlation Matrix Heatmap")
# Create a flat correlation table
correlation_table <- melt(correlation_matrix)
colnames(correlation_table) <- c("Variable_1", "Variable_2", "Correlation")
# Remove self-correlations and duplicates
correlation_table <- subset(correlation_table, Variable_1 != Variable_2)
# Display the pairwise correlation table in the console
print("Pairwise Correlation Table:")
## [1] "Pairwise Correlation Table:"
print(correlation_table)
## Variable_1 Variable_2 Correlation
## 2 Weight..kg. Age -0.0363396345
## 3 Height..m. Age -0.0278374949
## 4 Max_BPM Age -0.0170725970
## 5 Avg_BPM Age 0.0359691433
## 6 Resting_BPM Age 0.0043537136
## 7 Session_Duration..hours. Age -0.0199119043
## 8 Calories_Burned Age -0.1546787599
## 9 Fat_Percentage Age 0.0023700512
## 10 Water_Intake..liters. Age 0.0415283591
## 11 Workout_Frequency..days.week. Age 0.0080551635
## 12 Experience_Level Age -0.0186759269
## 13 BMI Age -0.0136913703
## 14 Age Weight..kg. -0.0363396345
## 16 Height..m. Weight..kg. 0.3653212026
## 17 Max_BPM Weight..kg. 0.0570611305
## 18 Avg_BPM Weight..kg. 0.0097174780
## 19 Resting_BPM Weight..kg. -0.0321380907
## 20 Session_Duration..hours. Weight..kg. -0.0136655614
## 21 Calories_Burned Weight..kg. 0.0954434730
## 22 Fat_Percentage Weight..kg. -0.2255116400
## 23 Water_Intake..liters. Weight..kg. 0.3942757103
## 24 Workout_Frequency..days.week. Weight..kg. -0.0117693278
## 25 Experience_Level Weight..kg. 0.0033785279
## 26 BMI Weight..kg. 0.8531576899
## 27 Age Height..m. -0.0278374949
## 28 Weight..kg. Height..m. 0.3653212026
## 30 Max_BPM Height..m. -0.0176598843
## 31 Avg_BPM Height..m. -0.0147762881
## 32 Resting_BPM Height..m. -0.0050898641
## 33 Session_Duration..hours. Height..m. -0.0102058973
## 34 Calories_Burned Height..m. 0.0863480511
## 35 Fat_Percentage Height..m. -0.2355209358
## 36 Water_Intake..liters. Height..m. 0.3935329016
## 37 Workout_Frequency..days.week. Height..m. -0.0112698825
## 38 Experience_Level Height..m. -0.0102666112
## 39 BMI Height..m. -0.1594687498
## 40 Age Max_BPM -0.0170725970
## 41 Weight..kg. Max_BPM 0.0570611305
## 42 Height..m. Max_BPM -0.0176598843
## 44 Avg_BPM Max_BPM -0.0397514432
## 45 Resting_BPM Max_BPM 0.0366474807
## 46 Session_Duration..hours. Max_BPM 0.0100509814
## 47 Calories_Burned Max_BPM 0.0020900159
## 48 Fat_Percentage Max_BPM -0.0090557315
## 49 Water_Intake..liters. Max_BPM 0.0316206428
## 50 Workout_Frequency..days.week. Max_BPM -0.0290990657
## 51 Experience_Level Max_BPM 0.0005448337
## 52 BMI Max_BPM 0.0671052310
## 53 Age Avg_BPM 0.0359691433
## 54 Weight..kg. Avg_BPM 0.0097174780
## 55 Height..m. Avg_BPM -0.0147762881
## 56 Max_BPM Avg_BPM -0.0397514432
## 58 Resting_BPM Avg_BPM 0.0596355022
## 59 Session_Duration..hours. Avg_BPM 0.0160144382
## 60 Calories_Burned Avg_BPM 0.3396586672
## 61 Fat_Percentage Avg_BPM -0.0073016551
## 62 Water_Intake..liters. Avg_BPM -0.0029106374
## 63 Workout_Frequency..days.week. Avg_BPM -0.0106807977
## 64 Experience_Level Avg_BPM -0.0008881572
## 65 BMI Avg_BPM 0.0216054995
## 66 Age Resting_BPM 0.0043537136
## 67 Weight..kg. Resting_BPM -0.0321380907
## 68 Height..m. Resting_BPM -0.0050898641
## 69 Max_BPM Resting_BPM 0.0366474807
## 70 Avg_BPM Resting_BPM 0.0596355022
## 72 Session_Duration..hours. Resting_BPM -0.0166488077
## 73 Calories_Burned Resting_BPM 0.0165179507
## 74 Fat_Percentage Resting_BPM -0.0168343892
## 75 Water_Intake..liters. Resting_BPM 0.0077259978
## 76 Workout_Frequency..days.week. Resting_BPM -0.0079668912
## 77 Experience_Level Resting_BPM 0.0017575852
## 78 BMI Resting_BPM -0.0325426318
## 79 Age Session_Duration..hours. -0.0199119043
## 80 Weight..kg. Session_Duration..hours. -0.0136655614
## 81 Height..m. Session_Duration..hours. -0.0102058973
## 82 Max_BPM Session_Duration..hours. 0.0100509814
## 83 Avg_BPM Session_Duration..hours. 0.0160144382
## 84 Resting_BPM Session_Duration..hours. -0.0166488077
## 86 Calories_Burned Session_Duration..hours. 0.9081403755
## 87 Fat_Percentage Session_Duration..hours. -0.5815197713
## 88 Water_Intake..liters. Session_Duration..hours. 0.2834109774
## 89 Workout_Frequency..days.week. Session_Duration..hours. 0.6441403664
## 90 Experience_Level Session_Duration..hours. 0.7647681189
## 91 BMI Session_Duration..hours. -0.0064926468
## 92 Age Calories_Burned -0.1546787599
## 93 Weight..kg. Calories_Burned 0.0954434730
## 94 Height..m. Calories_Burned 0.0863480511
## 95 Max_BPM Calories_Burned 0.0020900159
## 96 Avg_BPM Calories_Burned 0.3396586672
## 97 Resting_BPM Calories_Burned 0.0165179507
## 98 Session_Duration..hours. Calories_Burned 0.9081403755
## 100 Fat_Percentage Calories_Burned -0.5976152477
## 101 Water_Intake..liters. Calories_Burned 0.3569306830
## 102 Workout_Frequency..days.week. Calories_Burned 0.5761501255
## 103 Experience_Level Calories_Burned 0.6941294479
## 104 BMI Calories_Burned 0.0597608261
## 105 Age Fat_Percentage 0.0023700512
## 106 Weight..kg. Fat_Percentage -0.2255116400
## 107 Height..m. Fat_Percentage -0.2355209358
## 108 Max_BPM Fat_Percentage -0.0090557315
## 109 Avg_BPM Fat_Percentage -0.0073016551
## 110 Resting_BPM Fat_Percentage -0.0168343892
## 111 Session_Duration..hours. Fat_Percentage -0.5815197713
## 112 Calories_Burned Fat_Percentage -0.5976152477
## 114 Water_Intake..liters. Fat_Percentage -0.5886828341
## 115 Workout_Frequency..days.week. Fat_Percentage -0.5370595483
## 116 Experience_Level Fat_Percentage -0.6543626129
## 117 BMI Fat_Percentage -0.1192577600
## 118 Age Water_Intake..liters. 0.0415283591
## 119 Weight..kg. Water_Intake..liters. 0.3942757103
## 120 Height..m. Water_Intake..liters. 0.3935329016
## 121 Max_BPM Water_Intake..liters. 0.0316206428
## 122 Avg_BPM Water_Intake..liters. -0.0029106374
## 123 Resting_BPM Water_Intake..liters. 0.0077259978
## 124 Session_Duration..hours. Water_Intake..liters. 0.2834109774
## 125 Calories_Burned Water_Intake..liters. 0.3569306830
## 126 Fat_Percentage Water_Intake..liters. -0.5886828341
## 128 Workout_Frequency..days.week. Water_Intake..liters. 0.2385625706
## 129 Experience_Level Water_Intake..liters. 0.3041035494
## 130 BMI Water_Intake..liters. 0.2136965719
## 131 Age Workout_Frequency..days.week. 0.0080551635
## 132 Weight..kg. Workout_Frequency..days.week. -0.0117693278
## 133 Height..m. Workout_Frequency..days.week. -0.0112698825
## 134 Max_BPM Workout_Frequency..days.week. -0.0290990657
## 135 Avg_BPM Workout_Frequency..days.week. -0.0106807977
## 136 Resting_BPM Workout_Frequency..days.week. -0.0079668912
## 137 Session_Duration..hours. Workout_Frequency..days.week. 0.6441403664
## 138 Calories_Burned Workout_Frequency..days.week. 0.5761501255
## 139 Fat_Percentage Workout_Frequency..days.week. -0.5370595483
## 140 Water_Intake..liters. Workout_Frequency..days.week. 0.2385625706
## 142 Experience_Level Workout_Frequency..days.week. 0.8370787094
## 143 BMI Workout_Frequency..days.week. 0.0016449737
## 144 Age Experience_Level -0.0186759269
## 145 Weight..kg. Experience_Level 0.0033785279
## 146 Height..m. Experience_Level -0.0102666112
## 147 Max_BPM Experience_Level 0.0005448337
## 148 Avg_BPM Experience_Level -0.0008881572
## 149 Resting_BPM Experience_Level 0.0017575852
## 150 Session_Duration..hours. Experience_Level 0.7647681189
## 151 Calories_Burned Experience_Level 0.6941294479
## 152 Fat_Percentage Experience_Level -0.6543626129
## 153 Water_Intake..liters. Experience_Level 0.3041035494
## 154 Workout_Frequency..days.week. Experience_Level 0.8370787094
## 156 BMI Experience_Level 0.0160310726
## 157 Age BMI -0.0136913703
## 158 Weight..kg. BMI 0.8531576899
## 159 Height..m. BMI -0.1594687498
## 160 Max_BPM BMI 0.0671052310
## 161 Avg_BPM BMI 0.0216054995
## 162 Resting_BPM BMI -0.0325426318
## 163 Session_Duration..hours. BMI -0.0064926468
## 164 Calories_Burned BMI 0.0597608261
## 165 Fat_Percentage BMI -0.1192577600
## 166 Water_Intake..liters. BMI 0.2136965719
## 167 Workout_Frequency..days.week. BMI 0.0016449737
## 168 Experience_Level BMI 0.0160310726
Again, these are a few methods to analyse the general trends and correlations between the variables, without any transformations implemented. We dive deeper into transformations, assumptions and models below.
model <- lm(Calories_Burned ~ Age + Gender + Weight..kg. + Height..m. + Max_BPM + Avg_BPM +
Resting_BPM + Session_Duration..hours. + Workout_Type + Fat_Percentage +
Water_Intake..liters. + Workout_Frequency..days.week. + Experience_Level + BMI,
data = gym)
summary(model)
##
## Call:
## lm(formula = Calories_Burned ~ Age + Gender + Weight..kg. + Height..m. +
## Max_BPM + Avg_BPM + Resting_BPM + Session_Duration..hours. +
## Workout_Type + Fat_Percentage + Water_Intake..liters. + Workout_Frequency..days.week. +
## Experience_Level + BMI, data = gym)
##
## Residuals:
## Min 1Q Median 3Q Max
## -126.56 -24.59 -2.07 23.11 174.24
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.026e+03 8.746e+01 -11.732 <2e-16 ***
## Age -3.421e+00 1.051e-01 -32.567 <2e-16 ***
## GenderMale 8.281e+01 4.586e+00 18.058 <2e-16 ***
## Weight..kg. -1.145e+00 5.111e-01 -2.239 0.0254 *
## Height..m. 1.161e+02 4.705e+01 2.469 0.0137 *
## Max_BPM 4.258e-02 1.108e-01 0.384 0.7008
## Avg_BPM 6.238e+00 8.877e-02 70.278 <2e-16 ***
## Resting_BPM 3.894e-01 1.740e-01 2.238 0.0255 *
## Session_Duration..hours. 7.140e+02 5.911e+00 120.805 <2e-16 ***
## Workout_TypeHIIT -8.856e-01 3.650e+00 -0.243 0.8084
## Workout_TypeStrength -1.897e+00 3.515e+00 -0.540 0.5895
## Workout_TypeYoga -6.775e+00 3.583e+00 -1.891 0.0589 .
## Fat_Percentage -4.430e-01 3.364e-01 -1.317 0.1881
## Water_Intake..liters. -1.482e+00 3.250e+00 -0.456 0.6485
## Workout_Frequency..days.week. 1.754e+00 2.552e+00 0.687 0.4920
## Experience_Level -2.484e+00 3.984e+00 -0.623 0.5331
## BMI 3.811e+00 1.555e+00 2.450 0.0145 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.5 on 956 degrees of freedom
## Multiple R-squared: 0.9794, Adjusted R-squared: 0.979
## F-statistic: 2835 on 16 and 956 DF, p-value: < 2.2e-16
Multiple R-squared value is 0.9794, which aligns with the claim that 97.9% of the variance in the dependent variable (Calories_Burned) is explained by the predictors in the model.
The Adjusted R-squared value is 0.979, which is very close to the Multiple R-squared value. This indicates that the high R^2 value is robust and not inflated by unnecessary predictors.
The extremely high F-statistic (2835) and the very small p-value (< 2.2e-16) suggest that the predictors are highly significant in explaining the variance in the outcome variable.
Predictors with p-value < 0.05 significantly contribute to explaining Calories_Burned:
Age: Negative coefficient suggests that older individuals tend to burn less calories.
GenderMale: Positive coefficient indicates that males tend to burn more calories than females.
Weight..kg: There is a slight negative effect on calories burned.
Height..m.: Positive relationship, taller individuals burn more calories.
Avg_BPM: Strong positive association with calories burned.
Resting_BPM: Small but significant positive effect.
Session_Duration..hours.: Very strong positive effect.
BMI: Positive effect, higher BMI correlates with more calories burned.
Predictors with p-value > 0.05 do not significantly contribute to the model, which could possibly be removed from the regression model but further evaluation is required. These predictors include Max_BPM, Workout_Type (all levels), Fat_Percentage, Water_Intake..liters., Workout_Frequency..days.week., and Experience_Level.
# Fit the model
model <- lm(Calories_Burned ~ Age + Gender + Weight..kg. + Height..m. +
Max_BPM + Avg_BPM + Resting_BPM + Session_Duration..hours. +
Workout_Type + Fat_Percentage + Water_Intake..liters. +
Workout_Frequency..days.week. + Experience_Level + BMI, data = gym)
# Plot diagnostic plots
plot(model)
# Extract residuals from the model
residuals <- residuals(model)
# Create the Q-Q plot
qqnorm(residuals)
qqline(residuals, col = "red") # Add a reference line
The points mostly follow the red diagonal line, indicating the data roughly adheres to a normal distribution. There is noticeable deviation at the extreme ends, whereby the points at these extremes curve away from the diagonal, suggesting that the data might have heavier tails than expected under a normal distribution. This is indicative of potential non-normality in the tails (e.g., outliers or skewness).
# Shapiro-Wilk test
shapiro.test(residuals(model))
##
## Shapiro-Wilk normality test
##
## data: residuals(model)
## W = 0.98463, p-value = 1.372e-08
A W value of 0.98463 suggests a slight deviation from normality. Since the p-value = 1.372e-08 < 0.05, we can reject the null hypothesis. This means that there is enough evidence to suggest that the residuals are not normally distributed.
library(car)
set.seed(123)
durbinWatsonTest(model)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.05876589 2.115946 0.058
## Alternative hypothesis: rho != 0
The non significant p-value (p = 0.058 > 0.05) suggest that there is no evidence of autocorrelation. This confirms that the residuals are independent, which satisfies the assumption of independence for linear regression.
From our understanding of independence, the data set contains information from each individual gym member, in which their gym routines do not influence each other. As such, it is safe to say that the independence assumption is fulfilled.
crPlots(model)
These variables show no significant deviations from the pink line, suggesting they have a linear relationship with the dependent variables: Height (m), Max_BPM, Avg_BPM, Resting_BPM, Session_Duration (hours), Fat_Percentage, Water_Intake (liters), Workout_Frequency (days/week), Experience_Level
These variables do not require testing for linearity:
Gender: Categorical variable with two levels.
Workout_Type: Categorical variable with multiple levels.
The following variables show very slight curvature or trends in the pink line, indicating slight non-linearity that may require transformation or the inclusion of higher-order terms:
Age: Slight curvature suggests adding a quadratic term (Age^2) might improve the model.
Weight (kg): Curvature indicates potential non linearity, consider a transformation like log(Weight) or a polynomial term.
Fat_Percentage: Subtle curvature; transformation may be needed depending on the variable’s importance.
BMI: Slight curvature; consider transformations such as log(BMI) or polynomial terms.
Overall, the linearity assumption is fulfilled.
ncvTest(model)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 40.71479, Df = 1, p = 1.7615e-10
spreadLevelPlot(model)
##
## Suggested power transformation: 1.094521
Since the p-value = 1.7615e-10 < 0.05, we can reject the null hypothesis and conclude that the homoskedasticity assumption is violated. Furthermore, we see that there is a non-horizontal trend in the plot, which suggests a violation of the assumption of constant variance.
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.020363 1 1.010130
## Gender 3.270507 1 1.808454
## Weight..kg. 73.209600 1 8.556261
## Height..m. 22.497960 1 4.743201
## Max_BPM 1.015545 1 1.007742
## Avg_BPM 1.010233 1 1.005103
## Resting_BPM 1.012973 1 1.006465
## Session_Duration..hours. 2.561235 1 1.600386
## Workout_Type 1.040697 3 1.006671
## Fat_Percentage 2.762111 1 1.661960
## Water_Intake..liters. 2.370188 1 1.539542
## Workout_Frequency..days.week. 3.382808 1 1.839241
## Experience_Level 5.411264 1 2.326212
## BMI 66.873035 1 8.177593
sqrt(vif(model)) > 2
## GVIF Df GVIF^(1/(2*Df))
## Age FALSE FALSE FALSE
## Gender FALSE FALSE FALSE
## Weight..kg. TRUE FALSE TRUE
## Height..m. TRUE FALSE TRUE
## Max_BPM FALSE FALSE FALSE
## Avg_BPM FALSE FALSE FALSE
## Resting_BPM FALSE FALSE FALSE
## Session_Duration..hours. FALSE FALSE FALSE
## Workout_Type FALSE FALSE FALSE
## Fat_Percentage FALSE FALSE FALSE
## Water_Intake..liters. FALSE FALSE FALSE
## Workout_Frequency..days.week. FALSE FALSE FALSE
## Experience_Level TRUE FALSE FALSE
## BMI TRUE FALSE TRUE
Weight, Height, and BMI are closely related because BMI is calculated using Weight and Height. This leads to high multicollinearity between these variables. Since BMI is a function of weight and height, we can remove these 2 variables and keep BMI in the model
Experience Level also has a moderate VIF, suggesting that it might be somewhat correlated with other predictors in the model (e.g., age or workout-related variables), but this is not as severe as with Weight and Height. Since the correlation of Experience Level with other predictors is not very high and Experience Level is a meaningful predictor in our analysis, we will keep it in the model.
# Fit the model (exclude weight and height)
umodel <- lm(Calories_Burned ~ Age + Gender +
Max_BPM + Avg_BPM + Resting_BPM + Session_Duration..hours. +
Workout_Type + Fat_Percentage + Water_Intake..liters. +
Workout_Frequency..days.week. + Experience_Level + BMI, data = gym)
outlierTest(umodel)
## rstudent unadjusted p-value Bonferroni p
## 911 4.516448 7.0729e-06 0.0068819
## 512 4.253489 2.3112e-05 0.0224880
The unadjusted p-values for both observations (911 & 512) are very small (less than 0.05), which suggests that these data points may be outliers. The Bonferroni-adjusted p-values account for multiple comparisons and are still below 0.05, reinforcing that these observations are potential outliers.
As such, we proceeded to remove the 2 outliers.
# Remove the outliers (observations 911 and 512)
gym_clean <- gym[-c(911, 512), ]
# Refit the model without the outliers
model_clean <- lm(Calories_Burned ~ Age + Gender + Max_BPM + Avg_BPM + Resting_BPM +
Session_Duration..hours. + Workout_Type + Fat_Percentage + Water_Intake..liters. +
Workout_Frequency..days.week. + Experience_Level + BMI, data = gym_clean)
# Check for outliers again
outlierTest(model_clean)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 573 3.715856 0.00021428 0.20806
The result indicates that after removing the two outliers, there are no remaining observations with significant studentized residuals (p < 0.05). This suggests that the model no longer has influential outliers, and the remaining data is less likely to have a disproportionate impact on the regression results.
hat.plot <- function(model_clean) {
p <- length(coef(model_clean)) # Number of predictors
n <- length(fitted(model_clean)) # Number of observations
# Get hat values (leverage)
leverage_values <- hatvalues(model_clean)
# Plot hat values (leverage)
plot(hatvalues(model_clean),
main = "Index Plot of Hat Values",
xlab = "Observation Index",
ylab = "Hat Value",
pch = 16,
col = "blue")
# Add horizontal lines for high leverage threshold
abline(h = c(2, 3) * p / n, col = "red", lty = 2)
# Identify high leverage points (above threshold)
threshold <- 2 * p / n
high_leverage_indices <- which(leverage_values > threshold)
# Print the rows of the dataset that correspond to high leverage points
high_leverage_points <- gym_clean[high_leverage_indices, ]
print(high_leverage_points)
}
hat.plot(model_clean)
## Age Gender Weight..kg. Height..m. Max_BPM Avg_BPM Resting_BPM
## 262 57 Male 126.8 1.63 161 133 73
## Session_Duration..hours. Calories_Burned Workout_Type Fat_Percentage
## 262 0.96 632 Yoga 20.8
## Water_Intake..liters. Workout_Frequency..days.week. Experience_Level BMI
## 262 2.1 2 1 47.72
Very clearly, there is a point very much above the red line, indicating very high leverage value, as tested using the threshold of 2p/n. However, it does not equate to the point being an outlier, so we did not remove it.
# Calculate Cook's Distance
cooks_d <- cooks.distance(model_clean)
# Plot Cook's Distance
plot(cooks_d, type = "h", main = "Cook's Distance", ylab = "Cook's Distance", xlab = "Index")
abline(h = 4 / length(cooks_d), col = "red", lty = 2)
# Identify influential points
influential_points <- which(cooks_d > 4 / length(cooks_d))
print(influential_points)
## 4 7 45 47 78 90 91 106 107 116 125 149 152 153 158 178 230 277 284 316
## 4 7 45 47 78 90 91 106 107 116 125 149 152 153 158 178 230 277 284 316
## 322 329 343 372 381 395 404 408 410 429 468 476 486 498 573 592 605 614 620 645
## 322 329 343 372 381 395 404 408 410 429 468 476 486 498 572 591 604 613 619 644
## 647 650 651 669 693 695 711 713 737 738 739 759 785 797 798 826 844 878 898 912
## 646 649 650 668 692 694 710 712 736 737 738 758 784 796 797 825 843 877 897 910
## 943 949 958 966
## 941 947 956 964
The points above the red line are identified as influential points and are printed above. Since these points represent legitimate data from different individuals with unique characteristics, we cannot remove them from the analysis.
# Load necessary library
library(car)
# Generate Influence Plot for the model
influencePlot(model_clean, main= "Influence Plot", sub = "Circle size is proportional to Cook's Distance")
## StudRes Hat CookD
## 107 -3.1962396 0.02169546 0.0149594482
## 134 0.4003586 0.02676135 0.0002940879
## 262 0.5011883 0.03754950 0.0006538478
## 573 3.7158558 0.01409293 0.0129841126
## 713 3.4666853 0.01892426 0.0152783810
## 898 3.6014153 0.01525331 0.0132278947
Observations 713, 573, and 898 have high Cook’s Distance, indicating that they are influential.
Observation 262 has a high leverage value, meaning it is distant from the center of the predictor values. High leverage points are not necessarily problematic unless combined with high residuals or Cook’s Distance. As such, we did not remove 262.
Observation 107 is flagged as an outlier because its residual is greater than ±3. This indicates it does not fit well with the model’s predictions. However, being an outlier alone is not always sufficient to warrant removal. Observation 107 has a Cook’s Distance below the common threshold of 0.5, which suggests it is not highly influential on the overall regression model. This indicates that its removal may not drastically change the model coefficients. Therefore, we did not remove Observation 107.
library(car)
# Box-Cox transformation for the response variable
boxcox_result <- powerTransform(Calories_Burned ~ Age + Max_BPM + Avg_BPM + Resting_BPM +
Session_Duration..hours. + Fat_Percentage + Water_Intake..liters. +
Workout_Frequency..days.week. + Experience_Level + BMI,
data = gym_clean)
# View the suggested lambda values
summary(boxcox_result)
## bcPower Transformation to Normality
## Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1 0.7617 0.76 0.7056 0.8178
##
## Likelihood ratio test that transformation parameter is equal to 0
## (log transformation)
## LRT df pval
## LR test, lambda = (0) 576.6828 1 < 2.22e-16
##
## Likelihood ratio test that no transformation is needed
## LRT df pval
## LR test, lambda = (1) 66.42937 1 3.3307e-16
The Box-Cox transformation suggests applying a power transformation with lambda = 0.76 to stabilize variance and improve model fit.
gym_clean$Calories_Burned_transform <- (gym_clean$Calories_Burned^0.76 - 1) / 0.76
model_box <- lm(Calories_Burned_transform ~ Age + Gender + Max_BPM + Avg_BPM + Resting_BPM + Session_Duration..hours. + Workout_Type + Fat_Percentage + Water_Intake..liters. + Workout_Frequency..days.week. + Experience_Level + BMI,
data = gym_clean)
summary(model_box)
##
## Call:
## lm(formula = Calories_Burned_transform ~ Age + Gender + Max_BPM +
## Avg_BPM + Resting_BPM + Session_Duration..hours. + Workout_Type +
## Fat_Percentage + Water_Intake..liters. + Workout_Frequency..days.week. +
## Experience_Level + BMI, data = gym_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.9879 -4.0736 0.1496 4.4612 23.0354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.161e+02 5.424e+00 -21.398 < 2e-16 ***
## Age -6.510e-01 1.814e-02 -35.892 < 2e-16 ***
## GenderMale 1.861e+01 6.656e-01 27.962 < 2e-16 ***
## Max_BPM 9.391e-05 1.925e-02 0.005 0.996109
## Avg_BPM 1.208e+00 1.543e-02 78.281 < 2e-16 ***
## Resting_BPM 5.022e-02 3.020e-02 1.663 0.096617 .
## Session_Duration..hours. 1.435e+02 1.026e+00 139.832 < 2e-16 ***
## Workout_TypeHIIT -1.204e-01 6.325e-01 -0.190 0.849117
## Workout_TypeStrength -6.552e-01 6.102e-01 -1.074 0.283241
## Workout_TypeYoga -1.343e+00 6.211e-01 -2.163 0.030804 *
## Fat_Percentage 1.937e-01 5.844e-02 3.315 0.000951 ***
## Water_Intake..liters. -1.038e+00 5.636e-01 -1.843 0.065700 .
## Workout_Frequency..days.week. 4.492e-01 4.427e-01 1.015 0.310549
## Experience_Level -4.293e-01 6.913e-01 -0.621 0.534760
## BMI 5.341e-02 3.498e-02 1.527 0.127123
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.852 on 956 degrees of freedom
## Multiple R-squared: 0.9835, Adjusted R-squared: 0.9833
## F-statistic: 4080 on 14 and 956 DF, p-value: < 2.2e-16
Several predictors (e.g., Age, Gender, Max_BPM, Avg_BPM, Session_Duration.hours.) are statistically significant (p < 0.05), contributing meaningfully to predicting Calories_Burned_transform.
Multiple R-squared (0.9835) and Adjusted R-squared (0.9833): These indicate that the model explains 98.35% of the variance in the transformed Calories_Burned variable, a very strong fit.
F-statistic (4080) with a p-value (< 2.2e-16): Indicates the model is statistically significant overall.
Before transformation, the residuals showed heteroskedasticity and non-normality. The Box-Cox transformation (lambda = 0.76) addresses these issues, resulting in a better-fitted model with well-behaved residuals.
# Extract residuals from the model
residuals1 <- residuals(model_box)
# Create the Q-Q plot
qqnorm(residuals1)
qqline(residuals1, col = "red") # Add a reference line
# Shapiro-Wilk test
shapiro.test(residuals(model_box))
##
## Shapiro-Wilk normality test
##
## data: residuals(model_box)
## W = 0.99096, p-value = 1.117e-05
After removing the outliers and conducting power transformation, the residuals are generally well-aligned along the red line, with some deviation at the tails. The W value has increased from 0.98463 to 0.99096, indicating a closer adherence to normality. While the Shapiro-Wilk test yields a p-value of 1.117e-05, suggesting rejection of the null hypothesis of normality, this result is likely influenced by the large sample size as the test becomes highly sensitive to minor deviations from normality in larger datasets.
Given that the deviations are minor and primarily in the tails, and that the W value is close to 1, we conclude that the residuals sufficiently satisfy the normality assumption for practical purposes in the new model, where outliers have been removed, weight and height have been excluded, and power transformation has been conducted.
library(car)
durbinWatsonTest(model_box)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.05929336 2.116519 0.076
## Alternative hypothesis: rho != 0
The non significant p-value (p = 0.076 > 0.05) suggest that there is no evidence of autocorrelation. This confirms that the residuals are independent, which satisfies the assumption of independence for linear regression.
crPlots(model_box)
We observe that all variables show no significant deviations from the pink line, suggesting that linearity is satisfied.
ncvTest(model_box)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 8.194697, Df = 1, p = 0.0042013
spreadLevelPlot(model_box)
##
## Suggested power transformation: 1.078873
Since the p-value = 0.0042013 < 0.05, we can reject the null hypothesis and conclude that the homoskedasticity assumption is violated. Furthermore, we see that there is a non-horizontal trend in the plot, which suggests a violation of the assumption of constant variance.
Since heteroskedasticity is present, we proceed to fit another model via the Weighted Least Sqaures method.
# Step 1: Fit initial model and compute residuals
residuals <- abs(residuals(model_box))
# Step 2: Fit a model to predict residuals (e.g., using fitted values)
weights <- 1 / (fitted(lm(residuals ~ fitted(model_box)))^2)
# Step 3: Refit model using weights
model_wls <- lm(Calories_Burned_transform ~ Age + Gender + Max_BPM + Avg_BPM + Resting_BPM +
Session_Duration..hours. + Workout_Type + Fat_Percentage + Water_Intake..liters. +
Workout_Frequency..days.week. + Experience_Level + BMI,
data = gym_clean, weights = weights)
summary(model_wls)
##
## Call:
## lm(formula = Calories_Burned_transform ~ Age + Gender + Max_BPM +
## Avg_BPM + Resting_BPM + Session_Duration..hours. + Workout_Type +
## Fat_Percentage + Water_Intake..liters. + Workout_Frequency..days.week. +
## Experience_Level + BMI, data = gym_clean, weights = weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -5.6037 -0.7864 0.0507 0.8603 4.2366
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.132e+02 5.350e+00 -21.153 < 2e-16 ***
## Age -6.361e-01 1.801e-02 -35.313 < 2e-16 ***
## GenderMale 1.811e+01 6.573e-01 27.546 < 2e-16 ***
## Max_BPM 3.888e-05 1.913e-02 0.002 0.998378
## Avg_BPM 1.184e+00 1.534e-02 77.176 < 2e-16 ***
## Resting_BPM 4.536e-02 2.998e-02 1.513 0.130668
## Session_Duration..hours. 1.437e+02 1.004e+00 143.088 < 2e-16 ***
## Workout_TypeHIIT -1.572e-01 6.289e-01 -0.250 0.802694
## Workout_TypeStrength -7.828e-01 6.057e-01 -1.292 0.196546
## Workout_TypeYoga -1.300e+00 6.158e-01 -2.110 0.035101 *
## Fat_Percentage 2.012e-01 5.788e-02 3.476 0.000531 ***
## Water_Intake..liters. -9.322e-01 5.531e-01 -1.685 0.092224 .
## Workout_Frequency..days.week. 3.647e-01 4.401e-01 0.829 0.407510
## Experience_Level -4.070e-01 6.873e-01 -0.592 0.553838
## BMI 5.331e-02 3.453e-02 1.544 0.122916
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.308 on 956 degrees of freedom
## Multiple R-squared: 0.9838, Adjusted R-squared: 0.9835
## F-statistic: 4141 on 14 and 956 DF, p-value: < 2.2e-16
Multiple R-squared = 0.9838 and Adjusted R-squared = 0.9835: Indicate the model explains 98.38% of the variance in the transformed dependent variable. This is consistent with a well-fitted model.
F-statistic (4141): A high F-statistic and a very small p-value suggest that the overall model is statistically significant.
Many predictors remain statistically significant (p < 0.05): Age, GenderMale, Avg_BPM, and Session_Duration.hours. are particularly impactful. Workout_TypeYoga and Fat_Percentage are also significant but less so compared to other predictors.
However, there is minimal impact on R-squared values compared to the original model. R^2 values are nearly identical to those in the original model, suggesting that the fit has not changed dramatically, but is now more robust.
# Extract residuals from the model
residuals2 <- residuals(model_wls)
# Create the Q-Q plot
qqnorm(residuals2)
qqline(residuals2, col = "red") # Add a reference line
# Shapiro-Wilk test
shapiro.test(residuals(model_wls))
##
## Shapiro-Wilk normality test
##
## data: residuals(model_wls)
## W = 0.9905, p-value = 6.431e-06
The high W value of 0.9905 along with the points mostly lying along the red line suggest that the normality assumption is satisfied.
library(car)
durbinWatsonTest(model_wls)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.06124994 2.120462 0.06
## Alternative hypothesis: rho != 0
The non significant p-value (p = 0.06 > 0.05) suggest that there is no evidence of autocorrelation. This confirms that the residuals are independent, which satisfies the assumption of independence for linear regression.
crPlots(model_wls)
We observe that all variables show no significant deviations from the pink line, suggesting that linearity is satisfied.
ncvTest(model_wls)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 1.84367, Df = 1, p = 0.17452
spreadLevelPlot(model_wls)
##
## Suggested power transformation: 1.116163
Since the p-value (0.17452) is greater than 0.05, there is insufficient evidence to reject the null hypothesis, indicating that the assumption of homoskedasticity is satisfied. Additionally, the plot shows a roughly horizontal trend, confirming that the issue of heteroskedasticity has been resolved. This demonstrates that the transformed data now meets the assumption of constant variance.
Conclusion for Weighted Least Squares Model: Fulfills all 4 assumptions.
The model_wls has a high R-squared value of 0.9838, which suggests good fit. However, this could also indicate potential overfitting, where the model may have captured noise in addition to the true relationships.
To address this, we proceed with Stepwise Regression to simplify the model by removing less relevant predictors, thus reducing complexity and mitigating the risk of overfitting.
# Remove Experience_Level from the model and create a new model
model_wls_no_experience <- lm(Calories_Burned_transform ~ Age + Gender + Max_BPM + Avg_BPM + Resting_BPM +
Session_Duration..hours. + Workout_Type + Fat_Percentage + Water_Intake..liters. +
Workout_Frequency..days.week. + BMI, data = gym_clean, weights = weights)
# Run stepwise regression
model_stepwise <- step(model_wls_no_experience, direction = "both", trace = 1)
## Start: AIC=534.02
## Calories_Burned_transform ~ Age + Gender + Max_BPM + Avg_BPM +
## Resting_BPM + Session_Duration..hours. + Workout_Type + Fat_Percentage +
## Water_Intake..liters. + Workout_Frequency..days.week. + BMI
##
## Df Sum of Sq RSS AIC
## - Max_BPM 1 0 1635 532.0
## - Workout_Frequency..days.week. 1 1 1636 532.4
## - Workout_Type 3 10 1645 533.7
## <none> 1635 534.0
## - Resting_BPM 1 4 1639 534.3
## - BMI 1 4 1639 534.4
## - Water_Intake..liters. 1 5 1640 535.0
## - Fat_Percentage 1 25 1661 547.0
## - Gender 1 1334 2969 1111.3
## - Age 1 2132 3767 1342.4
## - Avg_BPM 1 10183 11818 2452.6
## - Session_Duration..hours. 1 42780 44416 3738.2
##
## Step: AIC=532.02
## Calories_Burned_transform ~ Age + Gender + Avg_BPM + Resting_BPM +
## Session_Duration..hours. + Workout_Type + Fat_Percentage +
## Water_Intake..liters. + Workout_Frequency..days.week. + BMI
##
## Df Sum of Sq RSS AIC
## - Workout_Frequency..days.week. 1 1 1636 530.4
## - Workout_Type 3 10 1645 531.7
## <none> 1635 532.0
## - Resting_BPM 1 4 1639 532.3
## - BMI 1 4 1639 532.4
## - Water_Intake..liters. 1 5 1640 533.0
## + Max_BPM 1 0 1635 534.0
## - Fat_Percentage 1 25 1661 545.0
## - Gender 1 1336 2971 1109.8
## - Age 1 2133 3768 1340.6
## - Avg_BPM 1 10202 11837 2452.1
## - Session_Duration..hours. 1 42813 44448 3736.9
##
## Step: AIC=530.36
## Calories_Burned_transform ~ Age + Gender + Avg_BPM + Resting_BPM +
## Session_Duration..hours. + Workout_Type + Fat_Percentage +
## Water_Intake..liters. + BMI
##
## Df Sum of Sq RSS AIC
## - Workout_Type 3 9 1645 530.0
## <none> 1636 530.4
## - Resting_BPM 1 4 1640 530.6
## - BMI 1 4 1640 530.8
## - Water_Intake..liters. 1 5 1641 531.3
## + Workout_Frequency..days.week. 1 1 1635 532.0
## + Max_BPM 1 0 1636 532.4
## - Fat_Percentage 1 25 1661 543.3
## - Gender 1 1361 2997 1116.3
## - Age 1 2132 3768 1338.7
## - Avg_BPM 1 10203 11839 2450.3
## - Session_Duration..hours. 1 52940 54576 3934.2
##
## Step: AIC=529.98
## Calories_Burned_transform ~ Age + Gender + Avg_BPM + Resting_BPM +
## Session_Duration..hours. + Fat_Percentage + Water_Intake..liters. +
## BMI
##
## Df Sum of Sq RSS AIC
## <none> 1645 530.0
## + Workout_Type 3 9 1636 530.4
## - Resting_BPM 1 4 1649 530.4
## - Water_Intake..liters. 1 4 1649 530.5
## - BMI 1 5 1650 530.9
## + Workout_Frequency..days.week. 1 0 1645 531.7
## + Max_BPM 1 0 1645 532.0
## - Fat_Percentage 1 26 1671 543.0
## - Gender 1 1353 2998 1110.8
## - Age 1 2149 3794 1339.4
## - Avg_BPM 1 10215 11860 2446.0
## - Session_Duration..hours. 1 53210 54856 3933.1
# Perform stepwise regression
#model_stepwise <- step(model_wls, direction = "both", trace = 1)
# View the results
summary(model_stepwise)
##
## Call:
## lm(formula = Calories_Burned_transform ~ Age + Gender + Avg_BPM +
## Resting_BPM + Session_Duration..hours. + Fat_Percentage +
## Water_Intake..liters. + BMI, data = gym_clean, weights = weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -5.7774 -0.7771 0.0407 0.8342 4.2272
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -113.46553 3.99614 -28.394 < 2e-16 ***
## Age -0.63706 0.01797 -35.450 < 2e-16 ***
## GenderMale 17.99521 0.63973 28.130 < 2e-16 ***
## Avg_BPM 1.18418 0.01532 77.285 < 2e-16 ***
## Resting_BPM 0.04664 0.02991 1.559 0.119248
## Session_Duration..hours. 143.59637 0.81407 176.393 < 2e-16 ***
## Fat_Percentage 0.20343 0.05258 3.869 0.000117 ***
## Water_Intake..liters. -0.87776 0.55164 -1.591 0.111897
## BMI 0.05862 0.03432 1.708 0.088007 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.308 on 962 degrees of freedom
## Multiple R-squared: 0.9837, Adjusted R-squared: 0.9835
## F-statistic: 7245 on 8 and 962 DF, p-value: < 2.2e-16
Residual Standard Error (1.308): This represents the average amount by which the observed values differ from the model’s predicted values. A lower value indicates better model fit. Given the scale of the data, a residual standard error of 1.308 suggests good accuracy.
Multiple R-squared (0.9837): This indicates that 98.37% of the variance in the dependent variable is explained by the predictors in the model. It reflects a very strong fit.
Adjusted R-squared (0.9835): This adjusts for the number of predictors in the model and is very close to the Multiple R-squared. This suggests that the model is not overfitting and includes meaningful predictors.
F-statistic (7245): A very high F-statistic value with a p-value less than 2.2e-16 indicates that the overall model is statistically significant. This means that the predictors collectively explain a significant portion of the variability in the dependent variable.
library(car)
# Check VIF
vif(model_stepwise)
## Age Gender Avg_BPM
## 1.005763 2.141175 1.005782
## Resting_BPM Session_Duration..hours. Fat_Percentage
## 1.005929 1.626399 2.113451
## Water_Intake..liters. BMI
## 2.270366 1.109606
The Variance Inflation Factor (VIF) values for all predictors are below 5, indicating that multicollinearity is not an issue in the model. This ensures that the regression coefficients are reliable, and no predictor overly influences others due to high inter-correlation. The model is well-specified in terms of variable independence.
Moreover, we were uncomfortable with this as a few of the coefficients still seem to be too extreme.
We proceed to test for the 4 assumptions for the model_stepwise anyway, to see if the model can be further improved.
# Extract residuals from the model
residuals3 <- residuals(model_stepwise)
# Create the Q-Q plot
qqnorm(residuals3)
qqline(residuals3, col = "red") # Add a reference line
# Shapiro-Wilk test
shapiro.test(residuals(model_stepwise))
##
## Shapiro-Wilk normality test
##
## data: residuals(model_stepwise)
## W = 0.98944, p-value = 1.896e-06
The high W value of 0.98944 along with the points mostly lying along the red line suggest that the normality assumption is satisfied.
library(car)
durbinWatsonTest(model_stepwise)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.06222019 2.122511 0.06
## Alternative hypothesis: rho != 0
The non significant p-value (p = 0.06 > 0.05) suggest that there is no evidence of autocorrelation. This confirms that the residuals are independent, which satisfies the assumption of independence for linear regression.
crPlots(model_stepwise)
Overall, the variables show no significant deviations from the pink line, with the mild exceptions of a few, such as age. Since those that suggest non-linearity are quite minor deviations from the line, we can assume that linearity is satisfied.
ncvTest(model_stepwise)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 2.126412, Df = 1, p = 0.14478
spreadLevelPlot(model_stepwise)
##
## Suggested power transformation: 1.132583
Since the p-value (0.14478) is greater than 0.05, there is insufficient evidence to reject the null hypothesis. This indicates that the assumption of homoskedasticity is satisfied. Moreover, the plot shows a roughly horizontal trend, confirming that the issue of heteroskedasticity has been addressed. Thus, the transformed data now meets the assumption of constant variance.
To solve the issue of having overly large coefficients, we decided to standardise the model.
# Standardize predictors
gym_clean_standardized <- gym_clean
gym_clean_standardized[, c("Age", "Avg_BPM", "Resting_BPM", "Session_Duration..hours.",
"Fat_Percentage", "Water_Intake..liters.", "BMI")] <-
scale(gym_clean[, c("Age", "Avg_BPM", "Resting_BPM",
"Session_Duration..hours.", "Fat_Percentage",
"Water_Intake..liters.", "BMI")])
# Re-run WLS model with standardized predictors
model_wls_standardized <- lm(Calories_Burned_transform ~ .,
data = gym_clean_standardized, weights = weights)
# Stepwise regression
model_stepwise_standardized <- step(model_wls_standardized, direction = "both", trace = TRUE)
## Start: AIC=-1823.22
## Calories_Burned_transform ~ Age + Gender + Weight..kg. + Height..m. +
## Max_BPM + Avg_BPM + Resting_BPM + Session_Duration..hours. +
## Calories_Burned + Workout_Type + Fat_Percentage + Water_Intake..liters. +
## Workout_Frequency..days.week. + Experience_Level + BMI
##
## Df Sum of Sq RSS AIC
## - Workout_Type 3 0.24 143.35 -1827.56
## - Max_BPM 1 0.01 143.12 -1825.14
## - Experience_Level 1 0.06 143.17 -1824.79
## - Resting_BPM 1 0.07 143.18 -1824.72
## - BMI 1 0.09 143.20 -1824.61
## - Height..m. 1 0.10 143.20 -1824.56
## - Workout_Frequency..days.week. 1 0.12 143.23 -1824.40
## - Weight..kg. 1 0.12 143.23 -1824.39
## <none> 143.10 -1823.22
## - Water_Intake..liters. 1 3.17 146.27 -1803.95
## - Age 1 13.95 157.05 -1734.93
## - Avg_BPM 1 29.88 172.98 -1641.10
## - Gender 1 32.38 175.48 -1627.18
## - Fat_Percentage 1 32.57 175.68 -1626.09
## - Session_Duration..hours. 1 52.96 196.07 -1519.48
## - Calories_Burned 1 1482.92 1626.03 534.62
##
## Step: AIC=-1827.56
## Calories_Burned_transform ~ Age + Gender + Weight..kg. + Height..m. +
## Max_BPM + Avg_BPM + Resting_BPM + Session_Duration..hours. +
## Calories_Burned + Fat_Percentage + Water_Intake..liters. +
## Workout_Frequency..days.week. + Experience_Level + BMI
##
## Df Sum of Sq RSS AIC
## - Max_BPM 1 0.01 143.36 -1829.49
## - Resting_BPM 1 0.06 143.41 -1829.14
## - Experience_Level 1 0.08 143.43 -1829.02
## - BMI 1 0.09 143.44 -1828.93
## - Height..m. 1 0.10 143.45 -1828.86
## - Workout_Frequency..days.week. 1 0.13 143.48 -1828.70
## - Weight..kg. 1 0.13 143.48 -1828.69
## <none> 143.35 -1827.56
## + Workout_Type 3 0.24 143.10 -1823.22
## - Water_Intake..liters. 1 3.12 146.47 -1808.64
## - Age 1 13.85 157.20 -1740.02
## - Avg_BPM 1 29.74 173.09 -1646.48
## - Gender 1 32.41 175.76 -1631.64
## - Fat_Percentage 1 32.65 176.00 -1630.34
## - Session_Duration..hours. 1 52.94 196.29 -1524.37
## - Calories_Burned 1 1491.82 1635.17 534.06
##
## Step: AIC=-1829.49
## Calories_Burned_transform ~ Age + Gender + Weight..kg. + Height..m. +
## Avg_BPM + Resting_BPM + Session_Duration..hours. + Calories_Burned +
## Fat_Percentage + Water_Intake..liters. + Workout_Frequency..days.week. +
## Experience_Level + BMI
##
## Df Sum of Sq RSS AIC
## - Resting_BPM 1 0.06 143.42 -1831.08
## - Experience_Level 1 0.08 143.44 -1830.96
## - BMI 1 0.09 143.45 -1830.85
## - Height..m. 1 0.10 143.46 -1830.78
## - Workout_Frequency..days.week. 1 0.12 143.48 -1830.65
## - Weight..kg. 1 0.13 143.49 -1830.61
## <none> 143.36 -1829.49
## + Max_BPM 1 0.01 143.35 -1827.56
## + Workout_Type 3 0.24 143.12 -1825.14
## - Water_Intake..liters. 1 3.11 146.47 -1810.63
## - Age 1 13.86 157.22 -1741.89
## - Avg_BPM 1 29.73 173.09 -1648.48
## - Gender 1 32.40 175.76 -1633.64
## - Fat_Percentage 1 32.66 176.02 -1632.20
## - Session_Duration..hours. 1 52.95 196.31 -1526.27
## - Calories_Burned 1 1491.81 1635.17 532.06
##
## Step: AIC=-1831.08
## Calories_Burned_transform ~ Age + Gender + Weight..kg. + Height..m. +
## Avg_BPM + Session_Duration..hours. + Calories_Burned + Fat_Percentage +
## Water_Intake..liters. + Workout_Frequency..days.week. + Experience_Level +
## BMI
##
## Df Sum of Sq RSS AIC
## - Experience_Level 1 0.08 143.50 -1832.53
## - BMI 1 0.09 143.51 -1832.47
## - Height..m. 1 0.10 143.52 -1832.41
## - Workout_Frequency..days.week. 1 0.13 143.55 -1832.23
## - Weight..kg. 1 0.13 143.55 -1832.22
## <none> 143.42 -1831.08
## + Resting_BPM 1 0.06 143.36 -1829.49
## + Max_BPM 1 0.01 143.41 -1829.14
## + Workout_Type 3 0.23 143.19 -1826.65
## - Water_Intake..liters. 1 3.11 146.53 -1812.24
## - Age 1 13.96 157.39 -1742.86
## - Avg_BPM 1 29.87 173.29 -1649.39
## - Gender 1 32.42 175.84 -1635.21
## - Fat_Percentage 1 32.71 176.13 -1633.62
## - Session_Duration..hours. 1 53.45 196.87 -1525.48
## - Calories_Burned 1 1496.38 1639.80 532.81
##
## Step: AIC=-1832.53
## Calories_Burned_transform ~ Age + Gender + Weight..kg. + Height..m. +
## Avg_BPM + Session_Duration..hours. + Calories_Burned + Fat_Percentage +
## Water_Intake..liters. + Workout_Frequency..days.week. + BMI
##
## Df Sum of Sq RSS AIC
## - Workout_Frequency..days.week. 1 0.05 143.55 -1834.19
## - BMI 1 0.09 143.60 -1833.89
## - Height..m. 1 0.10 143.60 -1833.83
## - Weight..kg. 1 0.13 143.63 -1833.64
## <none> 143.50 -1832.53
## + Experience_Level 1 0.08 143.42 -1831.08
## + Resting_BPM 1 0.06 143.44 -1830.96
## + Max_BPM 1 0.01 143.49 -1830.58
## + Workout_Type 3 0.25 143.26 -1828.20
## - Water_Intake..liters. 1 3.15 146.65 -1813.43
## - Age 1 13.91 157.41 -1744.72
## - Avg_BPM 1 29.83 173.33 -1651.15
## - Gender 1 32.99 176.49 -1633.60
## - Fat_Percentage 1 37.51 181.01 -1609.05
## - Session_Duration..hours. 1 53.71 197.21 -1525.82
## - Calories_Burned 1 1497.11 1640.62 531.29
##
## Step: AIC=-1834.19
## Calories_Burned_transform ~ Age + Gender + Weight..kg. + Height..m. +
## Avg_BPM + Session_Duration..hours. + Calories_Burned + Fat_Percentage +
## Water_Intake..liters. + BMI
##
## Df Sum of Sq RSS AIC
## - BMI 1 0.09 143.64 -1835.62
## - Height..m. 1 0.09 143.65 -1835.56
## - Weight..kg. 1 0.12 143.67 -1835.37
## <none> 143.55 -1834.19
## + Resting_BPM 1 0.06 143.49 -1832.62
## + Workout_Frequency..days.week. 1 0.05 143.50 -1832.53
## + Max_BPM 1 0.01 143.55 -1832.23
## + Experience_Level 1 0.00 143.55 -1832.23
## + Workout_Type 3 0.24 143.31 -1829.83
## - Water_Intake..liters. 1 3.13 146.69 -1815.22
## - Age 1 13.88 157.43 -1746.61
## - Avg_BPM 1 29.80 173.35 -1653.03
## - Gender 1 33.20 176.75 -1634.19
## - Fat_Percentage 1 39.89 183.45 -1598.08
## - Session_Duration..hours. 1 54.44 197.99 -1523.99
## - Calories_Burned 1 1497.27 1640.82 529.41
##
## Step: AIC=-1835.62
## Calories_Burned_transform ~ Age + Gender + Weight..kg. + Height..m. +
## Avg_BPM + Session_Duration..hours. + Calories_Burned + Fat_Percentage +
## Water_Intake..liters.
##
## Df Sum of Sq RSS AIC
## - Height..m. 1 0.01 143.65 -1837.56
## - Weight..kg. 1 0.17 143.80 -1836.49
## <none> 143.64 -1835.62
## + BMI 1 0.09 143.55 -1834.19
## + Resting_BPM 1 0.06 143.58 -1834.02
## + Workout_Frequency..days.week. 1 0.04 143.60 -1833.89
## + Experience_Level 1 0.01 143.63 -1833.67
## + Max_BPM 1 0.01 143.63 -1833.66
## + Workout_Type 3 0.25 143.39 -1831.28
## - Water_Intake..liters. 1 3.12 146.76 -1816.73
## - Age 1 14.25 157.89 -1745.76
## - Avg_BPM 1 30.18 173.81 -1652.45
## - Gender 1 33.23 176.86 -1635.56
## - Fat_Percentage 1 39.92 183.55 -1599.51
## - Session_Duration..hours. 1 55.14 198.78 -1522.15
## - Calories_Burned 1 1505.96 1649.60 532.59
##
## Step: AIC=-1837.56
## Calories_Burned_transform ~ Age + Gender + Weight..kg. + Avg_BPM +
## Session_Duration..hours. + Calories_Burned + Fat_Percentage +
## Water_Intake..liters.
##
## Df Sum of Sq RSS AIC
## - Weight..kg. 1 0.16 143.81 -1838.5
## <none> 143.65 -1837.6
## + Resting_BPM 1 0.06 143.59 -1836.0
## + Workout_Frequency..days.week. 1 0.04 143.60 -1835.8
## + Height..m. 1 0.01 143.64 -1835.6
## + Experience_Level 1 0.01 143.64 -1835.6
## + Max_BPM 1 0.01 143.64 -1835.6
## + BMI 1 0.00 143.65 -1835.6
## + Workout_Type 3 0.25 143.40 -1833.2
## - Water_Intake..liters. 1 3.13 146.77 -1818.7
## - Age 1 14.24 157.89 -1747.8
## - Avg_BPM 1 30.20 173.85 -1654.3
## - Gender 1 37.07 180.72 -1616.6
## - Fat_Percentage 1 39.91 183.56 -1601.5
## - Session_Duration..hours. 1 55.15 198.79 -1524.1
## - Calories_Burned 1 1505.96 1649.61 530.6
##
## Step: AIC=-1838.45
## Calories_Burned_transform ~ Age + Gender + Avg_BPM + Session_Duration..hours. +
## Calories_Burned + Fat_Percentage + Water_Intake..liters.
##
## Df Sum of Sq RSS AIC
## <none> 143.81 -1838.45
## + Weight..kg. 1 0.16 143.65 -1837.56
## + BMI 1 0.13 143.68 -1837.32
## + Resting_BPM 1 0.07 143.74 -1836.92
## + Workout_Frequency..days.week. 1 0.04 143.77 -1836.73
## + Max_BPM 1 0.01 143.80 -1836.53
## + Experience_Level 1 0.01 143.80 -1836.49
## + Height..m. 1 0.01 143.80 -1836.49
## + Workout_Type 3 0.26 143.55 -1834.22
## - Water_Intake..liters. 1 3.09 146.90 -1819.79
## - Age 1 14.27 158.08 -1748.59
## - Avg_BPM 1 30.08 173.89 -1656.04
## - Fat_Percentage 1 40.03 183.84 -1602.01
## - Gender 1 43.72 187.53 -1582.68
## - Session_Duration..hours. 1 54.99 198.80 -1526.02
## - Calories_Burned 1 1510.13 1653.94 531.15
# View summary
summary(model_stepwise_standardized)
##
## Call:
## lm(formula = Calories_Burned_transform ~ Age + Gender + Avg_BPM +
## Session_Duration..hours. + Calories_Burned + Fat_Percentage +
## Water_Intake..liters., data = gym_clean_standardized, weights = weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -1.48297 -0.17618 0.06248 0.25218 0.81353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.621649 1.453472 50.652 < 2e-16 ***
## Age -0.918894 0.094006 -9.775 < 2e-16 ***
## GenderMale 3.980878 0.232649 17.111 < 2e-16 ***
## Avg_BPM 2.272030 0.160092 14.192 < 2e-16 ***
## Session_Duration..hours. 7.998440 0.416807 19.190 < 2e-16 ***
## Calories_Burned 0.169504 0.001686 100.560 < 2e-16 ***
## Fat_Percentage 1.585146 0.096821 16.372 < 2e-16 ***
## Water_Intake..liters. -0.444620 0.097711 -4.550 6.04e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3864 on 963 degrees of freedom
## Multiple R-squared: 0.9986, Adjusted R-squared: 0.9986
## F-statistic: 9.625e+04 on 7 and 963 DF, p-value: < 2.2e-16
The regression model demonstrates an outstanding fit, with a Multiple R-squared of 0.9986 and an Adjusted R-squared of 0.9986, indicating that the predictors collectively explain 99.86% of the variance in the dependent variable.
The residual standard error of 0.3864 suggests highly accurate predictions.
The F-statistic of 96,250 and its associated p-value (< 2.2e-16) confirm that the model is statistically significant, with the predictors playing a crucial role in explaining the variability in the response variable. This highlights the robustness and reliability of the model.
# Extract residuals from the model
residuals4 <- residuals(model_stepwise_standardized)
# Create the Q-Q plot
qqnorm(residuals4)
qqline(residuals4, col = "red") # Add a reference line
# Shapiro-Wilk test
shapiro.test(residuals(model_stepwise_standardized))
##
## Shapiro-Wilk normality test
##
## data: residuals(model_stepwise_standardized)
## W = 0.94183, p-value < 2.2e-16
W = 0.94183: This is the test statistic, where values closer to 1 indicate normality.
QQ Plot: The points at the tails seem to deviate too much from the line.
p-value: Much smaller than 0.05.
Hence, normality is not satisfied, which is not an improvement from the non-standardised stepwise model.
library(car)
durbinWatsonTest(model_stepwise_standardized)
## lag Autocorrelation D-W Statistic p-value
## 1 0.0449558 1.909334 0.16
## Alternative hypothesis: rho != 0
The non significant p-value (p = 0.16 > 0.05) suggest that there is no evidence of autocorrelation. This confirms that the residuals are independent, which satisfies the assumption of independence for linear regression.
crPlots(model_stepwise_standardized)
Overall, the variables show no significant deviations from the pink line, with the mild exceptions of a few. Since those that suggest non-linearity are quite minor deviations from the line, we can assume that linearity is satisfied.
ncvTest(model_stepwise_standardized)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 1.928239, Df = 1, p = 0.16495
spreadLevelPlot(model_stepwise_standardized)
##
## Suggested power transformation: 1.536417
Since the p-value (0.16495) is greater than 0.05, there is insufficient evidence to reject the null hypothesis, indicating that the assumption of homoskedasticity is satisfied. Moreover, the plot shows a roughly horizontal trend, confirming that the issue of heteroskedasticity has been resolved.
model_stepwise: Coefficients seem too large, and the linearity assumption is violated.
model_stepwise_standardised: Normality assumption is violated, although the coefficients and assumption violations have improved due to scaling and standardisation.
library(leaps)
# Fit the regsubsets model
model_subset <- regsubsets(Calories_Burned_transform ~ Age + Gender + Max_BPM + Avg_BPM + Resting_BPM +
Session_Duration..hours. + Workout_Type + Fat_Percentage + Water_Intake..liters. +
Workout_Frequency..days.week. + Experience_Level + BMI,
data = gym_clean,
weights = weights)
# Display the results
summary(model_subset)
## Subset selection object
## Call: regsubsets.formula(Calories_Burned_transform ~ Age + Gender +
## Max_BPM + Avg_BPM + Resting_BPM + Session_Duration..hours. +
## Workout_Type + Fat_Percentage + Water_Intake..liters. + Workout_Frequency..days.week. +
## Experience_Level + BMI, data = gym_clean, weights = weights)
## 14 Variables (and intercept)
## Forced in Forced out
## Age FALSE FALSE
## GenderMale FALSE FALSE
## Max_BPM FALSE FALSE
## Avg_BPM FALSE FALSE
## Resting_BPM FALSE FALSE
## Session_Duration..hours. FALSE FALSE
## Workout_TypeHIIT FALSE FALSE
## Workout_TypeStrength FALSE FALSE
## Workout_TypeYoga FALSE FALSE
## Fat_Percentage FALSE FALSE
## Water_Intake..liters. FALSE FALSE
## Workout_Frequency..days.week. FALSE FALSE
## Experience_Level FALSE FALSE
## BMI FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## Age GenderMale Max_BPM Avg_BPM Resting_BPM Session_Duration..hours.
## 1 ( 1 ) " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " "*" " " "*"
## 3 ( 1 ) " " "*" " " "*" " " "*"
## 4 ( 1 ) "*" "*" " " "*" " " "*"
## 5 ( 1 ) "*" "*" " " "*" " " "*"
## 6 ( 1 ) "*" "*" " " "*" " " "*"
## 7 ( 1 ) "*" "*" " " "*" " " "*"
## 8 ( 1 ) "*" "*" " " "*" " " "*"
## Workout_TypeHIIT Workout_TypeStrength Workout_TypeYoga Fat_Percentage
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " "*"
## 6 ( 1 ) " " " " "*" "*"
## 7 ( 1 ) " " " " "*" "*"
## 8 ( 1 ) " " " " "*" "*"
## Water_Intake..liters. Workout_Frequency..days.week. Experience_Level
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) " " " " " "
## 3 ( 1 ) " " " " " "
## 4 ( 1 ) " " " " " "
## 5 ( 1 ) " " " " " "
## 6 ( 1 ) " " " " " "
## 7 ( 1 ) "*" " " " "
## 8 ( 1 ) "*" " " " "
## BMI
## 1 ( 1 ) " "
## 2 ( 1 ) " "
## 3 ( 1 ) " "
## 4 ( 1 ) " "
## 5 ( 1 ) " "
## 6 ( 1 ) " "
## 7 ( 1 ) " "
## 8 ( 1 ) "*"
# Extract the summary of the regsubsets object
summary_subsets <- summary(model_subset)
# Check the metrics (Adjusted R², Cp, BIC)
summary_subsets$adjr2 # Adjusted R² for each subset
## [1] 0.8392668 0.9387154 0.9615520 0.9831291 0.9834584 0.9835116 0.9835402
## [8] 0.9835642
summary_subsets$cp # Cp for each subset
## [1] 8495.921716 2639.312020 1295.894904 29.175818 10.843585 8.720669
## [7] 8.046948 7.643880
summary_subsets$bic # BIC for each subset
## [1] -1762.242 -2692.622 -3139.448 -3933.400 -3946.668 -3943.926 -3939.740
## [8] -3935.288
The Adjusted R² increases as we add more predictors, indicating that the model fit improves with more predictors. The increase seems to stabilise after 4 predictors (around 0.9835), suggesting that adding more predictors beyond this point does not provide a substantial increase in fit.
As we add more predictors, Cp decreases, which is expected, but the rate of decrease slows down. The smallest Cp value is 7.643880 for the model with 8 predictors, which suggests that this model is very close to the optimal size in terms of predictive accuracy. The model with 8 predictors has a Cp value of 7.643880, which is quite close to 8, indicating it is a good model choice.
The BIC decreases as the number of predictors increases, which is typical because adding predictors initially improves the model fit. However, the rate of decrease slows down, and at a certain point, the improvement becomes minimal. The lowest BIC value is -3935.288 for the model with 8 predictors, suggesting that the model with 8 predictors is the best in terms of BIC.
# Display the best subsets for each number of predictors
# This will show you which variables are selected for each subset of predictors
summary_subsets$which
## (Intercept) Age GenderMale Max_BPM Avg_BPM Resting_BPM
## 1 TRUE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE FALSE FALSE FALSE TRUE FALSE
## 3 TRUE FALSE TRUE FALSE TRUE FALSE
## 4 TRUE TRUE TRUE FALSE TRUE FALSE
## 5 TRUE TRUE TRUE FALSE TRUE FALSE
## 6 TRUE TRUE TRUE FALSE TRUE FALSE
## 7 TRUE TRUE TRUE FALSE TRUE FALSE
## 8 TRUE TRUE TRUE FALSE TRUE FALSE
## Session_Duration..hours. Workout_TypeHIIT Workout_TypeStrength
## 1 TRUE FALSE FALSE
## 2 TRUE FALSE FALSE
## 3 TRUE FALSE FALSE
## 4 TRUE FALSE FALSE
## 5 TRUE FALSE FALSE
## 6 TRUE FALSE FALSE
## 7 TRUE FALSE FALSE
## 8 TRUE FALSE FALSE
## Workout_TypeYoga Fat_Percentage Water_Intake..liters.
## 1 FALSE FALSE FALSE
## 2 FALSE FALSE FALSE
## 3 FALSE FALSE FALSE
## 4 FALSE FALSE FALSE
## 5 FALSE TRUE FALSE
## 6 TRUE TRUE FALSE
## 7 TRUE TRUE TRUE
## 8 TRUE TRUE TRUE
## Workout_Frequency..days.week. Experience_Level BMI
## 1 FALSE FALSE FALSE
## 2 FALSE FALSE FALSE
## 3 FALSE FALSE FALSE
## 4 FALSE FALSE FALSE
## 5 FALSE FALSE FALSE
## 6 FALSE FALSE FALSE
## 7 FALSE FALSE FALSE
## 8 FALSE FALSE TRUE
# Get the best model based on Adjusted R², Cp, and BIC
# Best model based on Adjusted R² (maximized)
best_adj_r2_model <- which.max(summary_subsets$adjr2)
cat("Best model based on Adjusted R²: ", best_adj_r2_model, "\n")
## Best model based on Adjusted R²: 8
cat("Predictors selected for Adjusted R²: ", summary_subsets$which[best_adj_r2_model, ], "\n")
## Predictors selected for Adjusted R²: TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE
# Best model based on Cp (minimized)
best_cp_model <- which.min(summary_subsets$cp)
cat("Best model based on Cp: ", best_cp_model, "\n")
## Best model based on Cp: 8
cat("Predictors selected for Cp: ", summary_subsets$which[best_cp_model, ], "\n")
## Predictors selected for Cp: TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE
# Best model based on BIC (minimized)
best_bic_model <- which.min(summary_subsets$bic)
cat("Best model based on BIC: ", best_bic_model, "\n")
## Best model based on BIC: 5
cat("Predictors selected for BIC: ", summary_subsets$which[best_bic_model, ], "\n")
## Predictors selected for BIC: TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
We chose to follow the model suggested by the adjusted R^2 and Cp value.
model_all <- lm(Calories_Burned_transform ~ Age + Gender + Avg_BPM +
Session_Duration..hours. + Workout_Type + Fat_Percentage + Water_Intake..liters. + BMI, data = gym_clean, weights = weights)
summary(model_all)
##
## Call:
## lm(formula = Calories_Burned_transform ~ Age + Gender + Avg_BPM +
## Session_Duration..hours. + Workout_Type + Fat_Percentage +
## Water_Intake..liters. + BMI, data = gym_clean, weights = weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -5.7530 -0.7778 0.0351 0.8517 4.3493
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -109.86709 3.56141 -30.849 < 2e-16 ***
## Age -0.63552 0.01798 -35.339 < 2e-16 ***
## GenderMale 18.11015 0.64121 28.244 < 2e-16 ***
## Avg_BPM 1.18517 0.01530 77.483 < 2e-16 ***
## Session_Duration..hours. 143.60200 0.81531 176.132 < 2e-16 ***
## Workout_TypeHIIT -0.11235 0.62777 -0.179 0.858001
## Workout_TypeStrength -0.73643 0.60414 -1.219 0.223155
## Workout_TypeYoga -1.31381 0.61467 -2.137 0.032814 *
## Fat_Percentage 0.20081 0.05265 3.814 0.000145 ***
## Water_Intake..liters. -0.93902 0.55185 -1.702 0.089158 .
## BMI 0.05075 0.03436 1.477 0.139991
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.307 on 960 degrees of freedom
## Multiple R-squared: 0.9837, Adjusted R-squared: 0.9836
## F-statistic: 5804 on 10 and 960 DF, p-value: < 2.2e-16
# Extract residuals from the model
residuals5 <- residuals(model_all)
# Create the Q-Q plot
qqnorm(residuals5)
qqline(residuals5, col = "red") # Add a reference line
# Shapiro-Wilk test
shapiro.test(residuals(model_all))
##
## Shapiro-Wilk normality test
##
## data: residuals(model_all)
## W = 0.9901, p-value = 4.044e-06
The high W value of 0.9901 along with the points mostly lying along the red line suggest that the normality assumption is satisfied.
library(car)
durbinWatsonTest(model_all)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.0620595 2.12216 0.034
## Alternative hypothesis: rho != 0
The p-value = 0.034 < 0.05 suggest that there is evidence of autocorrelation. This means that the residuals are not independent, which violates the assumption of independence for linear regression.
crPlots(model_all)
Overall, the variables show no significant deviations from the pink line, with the mild exceptions of a few. Since those that suggest non-linearity are quite minor deviations from the line, we can assume that linearity is satisfied.
ncvTest(model_all)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 2.356334, Df = 1, p = 0.12478
spreadLevelPlot(model_all)
##
## Suggested power transformation: 1.113266
Since the p-value (0.12478) is greater than 0.05, there is insufficient evidence to reject the null hypothesis, suggesting that the homoskedasticity assumption is satisfied. Additionally, the plot shows a roughly horizontal trend, confirming that the issue of heteroskedasticity has been addressed.
Now, to solve the issue of having overly large coefficients, again, similar to Stepwise Regression and Stepwise Regression Standardised, we decided to standardise the model.
# Load required library
library(leaps)
# Step 1: Standardize the predictors
gym_clean_standardized <- gym_clean
# Standardize all continuous predictors
gym_clean_standardized[, c("Age", "Max_BPM", "Avg_BPM", "Resting_BPM",
"Session_Duration..hours.", "Fat_Percentage",
"Water_Intake..liters.", "Workout_Frequency..days.week.", "BMI")] <-
scale(gym_clean[, c("Age", "Max_BPM", "Avg_BPM", "Resting_BPM",
"Session_Duration..hours.", "Fat_Percentage",
"Water_Intake..liters.", "Workout_Frequency..days.week.", "BMI")])
# Step 2: Fit the regsubsets model on standardized predictors
model_subset <- regsubsets(Calories_Burned_transform ~ Age + Gender + Max_BPM + Avg_BPM + Resting_BPM +
Session_Duration..hours. + Workout_Type + Fat_Percentage + Water_Intake..liters. +
Workout_Frequency..days.week. + Experience_Level + BMI,
data = gym_clean_standardized,
weights = weights,
nvmax = 10, # Maximum number of predictors
method = "exhaustive") # Exhaustive search
# Step 3: Extract and display results
summary_subsets <- summary(model_subset)
# Metrics for model comparison
cat("Adjusted R²:\n")
## Adjusted R²:
print(summary_subsets$adjr2)
## [1] 0.8392668 0.9387154 0.9615520 0.9831291 0.9834584 0.9835116 0.9835402
## [8] 0.9835642 0.9835849 0.9835967
cat("\nMallows' Cp:\n")
##
## Mallows' Cp:
print(summary_subsets$cp)
## [1] 8495.921716 2639.312020 1295.894904 29.175818 10.843585 8.720669
## [7] 8.046948 7.643880 7.437859 7.748079
cat("\nBIC:\n")
##
## BIC:
print(summary_subsets$bic)
## [1] -1762.242 -2692.622 -3139.448 -3933.400 -3946.668 -3943.926 -3939.740
## [8] -3935.288 -3930.642 -3925.477
# Step 4: Plot selection criteria
par(mfrow = c(1, 3)) # Set up 3 plots in one row
plot(model_subset, scale = "adjr2", main = "Adjusted R²")
plot(model_subset, scale = "Cp", main = "Mallows' Cp")
plot(model_subset, scale = "bic", main = "BIC")
# Step 5: Display the best subset
best_model <- coef(model_subset, which.max(summary_subsets$adjr2)) # Model with highest Adjusted R²
print("Coefficients of the Best Model:")
## [1] "Coefficients of the Best Model:"
print(best_model)
## (Intercept) Age GenderMale
## 219.8726866 -7.7491304 18.1012924
## Avg_BPM Resting_BPM Session_Duration..hours.
## 16.9658228 0.3274093 49.1179808
## Workout_TypeStrength Workout_TypeYoga Fat_Percentage
## -0.6893538 -1.2349161 1.2649733
## Water_Intake..liters. BMI
## -0.5598955 0.3539094
# Display the best subsets for each number of predictors
# This will show which variables are selected for each subset
cat("Best Subsets for Each Number of Predictors:\n")
## Best Subsets for Each Number of Predictors:
print(summary_subsets$which)
## (Intercept) Age GenderMale Max_BPM Avg_BPM Resting_BPM
## 1 TRUE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE FALSE FALSE FALSE TRUE FALSE
## 3 TRUE FALSE TRUE FALSE TRUE FALSE
## 4 TRUE TRUE TRUE FALSE TRUE FALSE
## 5 TRUE TRUE TRUE FALSE TRUE FALSE
## 6 TRUE TRUE TRUE FALSE TRUE FALSE
## 7 TRUE TRUE TRUE FALSE TRUE FALSE
## 8 TRUE TRUE TRUE FALSE TRUE FALSE
## 9 TRUE TRUE TRUE FALSE TRUE TRUE
## 10 TRUE TRUE TRUE FALSE TRUE TRUE
## Session_Duration..hours. Workout_TypeHIIT Workout_TypeStrength
## 1 TRUE FALSE FALSE
## 2 TRUE FALSE FALSE
## 3 TRUE FALSE FALSE
## 4 TRUE FALSE FALSE
## 5 TRUE FALSE FALSE
## 6 TRUE FALSE FALSE
## 7 TRUE FALSE FALSE
## 8 TRUE FALSE FALSE
## 9 TRUE FALSE FALSE
## 10 TRUE FALSE TRUE
## Workout_TypeYoga Fat_Percentage Water_Intake..liters.
## 1 FALSE FALSE FALSE
## 2 FALSE FALSE FALSE
## 3 FALSE FALSE FALSE
## 4 FALSE FALSE FALSE
## 5 FALSE TRUE FALSE
## 6 TRUE TRUE FALSE
## 7 TRUE TRUE TRUE
## 8 TRUE TRUE TRUE
## 9 TRUE TRUE TRUE
## 10 TRUE TRUE TRUE
## Workout_Frequency..days.week. Experience_Level BMI
## 1 FALSE FALSE FALSE
## 2 FALSE FALSE FALSE
## 3 FALSE FALSE FALSE
## 4 FALSE FALSE FALSE
## 5 FALSE FALSE FALSE
## 6 FALSE FALSE FALSE
## 7 FALSE FALSE FALSE
## 8 FALSE FALSE TRUE
## 9 FALSE FALSE TRUE
## 10 FALSE FALSE TRUE
# Best model based on Adjusted R² (maximized)
best_adj_r2_model <- which.max(summary_subsets$adjr2)
cat("\nBest model based on Adjusted R²:\n")
##
## Best model based on Adjusted R²:
cat("Number of Predictors:", best_adj_r2_model, "\n")
## Number of Predictors: 10
cat("Predictors selected:\n")
## Predictors selected:
print(names(summary_subsets$which[best_adj_r2_model, ][summary_subsets$which[best_adj_r2_model, ] == TRUE]))
## [1] "(Intercept)" "Age"
## [3] "GenderMale" "Avg_BPM"
## [5] "Resting_BPM" "Session_Duration..hours."
## [7] "Workout_TypeStrength" "Workout_TypeYoga"
## [9] "Fat_Percentage" "Water_Intake..liters."
## [11] "BMI"
# Best model based on Cp (minimized)
best_cp_model <- which.min(summary_subsets$cp)
cat("\nBest model based on Cp:\n")
##
## Best model based on Cp:
cat("Number of Predictors:", best_cp_model, "\n")
## Number of Predictors: 9
cat("Predictors selected:\n")
## Predictors selected:
print(names(summary_subsets$which[best_cp_model, ][summary_subsets$which[best_cp_model, ] == TRUE]))
## [1] "(Intercept)" "Age"
## [3] "GenderMale" "Avg_BPM"
## [5] "Resting_BPM" "Session_Duration..hours."
## [7] "Workout_TypeYoga" "Fat_Percentage"
## [9] "Water_Intake..liters." "BMI"
# Best model based on BIC (minimized)
best_bic_model <- which.min(summary_subsets$bic)
cat("\nBest model based on BIC:\n")
##
## Best model based on BIC:
cat("Number of Predictors:", best_bic_model, "\n")
## Number of Predictors: 5
cat("Predictors selected:\n")
## Predictors selected:
print(names(summary_subsets$which[best_bic_model, ][summary_subsets$which[best_bic_model, ] == TRUE]))
## [1] "(Intercept)" "Age"
## [3] "GenderMale" "Avg_BPM"
## [5] "Session_Duration..hours." "Fat_Percentage"
# Fit the final model using standardized predictors
model_all_standardized <- lm(Calories_Burned_transform ~ Age + Gender + Avg_BPM + Resting_BPM + Session_Duration..hours. + Workout_Type + Fat_Percentage + Water_Intake..liters. + BMI,
data = gym_clean_standardized, weights = weights)
# View the summary of the final model
summary(model_all_standardized)
##
## Call:
## lm(formula = Calories_Burned_transform ~ Age + Gender + Avg_BPM +
## Resting_BPM + Session_Duration..hours. + Workout_Type + Fat_Percentage +
## Water_Intake..liters. + BMI, data = gym_clean_standardized,
## weights = weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -5.6381 -0.7873 0.0456 0.8671 4.2634
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 219.9405 0.5295 415.388 < 2e-16 ***
## Age -7.7471 0.2191 -35.358 < 2e-16 ***
## GenderMale 18.1029 0.6408 28.250 < 2e-16 ***
## Avg_BPM 16.9650 0.2193 77.344 < 2e-16 ***
## Resting_BPM 0.3294 0.2192 1.503 0.133212
## Session_Duration..hours. 49.1215 0.2788 176.178 < 2e-16 ***
## Workout_TypeHIIT -0.1482 0.6278 -0.236 0.813468
## Workout_TypeStrength -0.7578 0.6039 -1.255 0.209838
## Workout_TypeYoga -1.3034 0.6143 -2.122 0.034108 *
## Fat_Percentage 1.2640 0.3279 3.855 0.000124 ***
## Water_Intake..liters. -0.5609 0.3306 -1.697 0.090110 .
## BMI 0.3533 0.2291 1.542 0.123428
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.306 on 959 degrees of freedom
## Multiple R-squared: 0.9838, Adjusted R-squared: 0.9836
## F-statistic: 5283 on 11 and 959 DF, p-value: < 2.2e-16
The regression model demonstrates an excellent fit, with a Multiple R-squared of 0.9838 and an Adjusted R-squared of 0.9836, indicating that the predictors collectively explain 98.38% of the variability in the dependent variable.
The residual standard error of 1.306 suggests that the predictions are highly accurate.
The high F-statistic (5283) and its associated p-value (< 2.2e-16) confirm that the model is statistically significant, with the predictors playing a crucial role in explaining the variability in the response variable.
# Extract residuals from the model
residuals5 <- residuals(model_all_standardized)
# Create the Q-Q plot
qqnorm(residuals5)
qqline(residuals5, col = "red") # Add a reference line
# Shapiro-Wilk test
shapiro.test(residuals(model_all_standardized))
##
## Shapiro-Wilk normality test
##
## data: residuals(model_all_standardized)
## W = 0.99033, p-value = 5.269e-06
The high W value of 0.99033 along with the points mostly lying along the red line suggest that the normality assumption is satisfied.
library(car)
durbinWatsonTest(model_all_standardized)
## lag Autocorrelation D-W Statistic p-value
## 1 -0.06079344 2.119547 0.058
## Alternative hypothesis: rho != 0
The non significant p-value (p = 0.058 > 0.05) suggest that there is no evidence of autocorrelation. This confirms that the residuals are independent, which satisfies the assumption of independence for linear regression.
crPlots(model_all_standardized)
Overall, the variables show no significant deviations from the pink line, with the mild exceptions of a few. Since those that suggest non-linearity are quite minor deviations from the line, we can assume that linearity is satisfied.
ncvTest(model_all_standardized)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 2.026512, Df = 1, p = 0.15458
spreadLevelPlot(model_all_standardized)
##
## Suggested power transformation: 1.107399
Since the p-value (0.15458) is greater than 0.05, there is insufficient evidence to reject the null hypothesis, indicating that the assumption of homoskedasticity is satisfied. Additionally, the plot displays a roughly horizontal trend, confirming that the issue of heteroskedasticity has been addressed. Thus, the transformed data now meets the assumption of constant variance.
Non-standardised All Subset Regression Model: Not all the assumptions are satisfied.
Standardised All Subset Regression Model: All the assumptions are satisfied but the coefficients still seem oddly large after standardisation.
Comparing the 4 models that we have (model_box, model_wls, model_stepwise/model_stepwise_standardized and model_all/model_all_standardized), it seems the best to use model_wls. This is because the absolute value of coefficients makes more sense in real life, while still satisfying the 4 basic assumptions with a high R^2 value.
# Extract the coefficients (standardized) from the model
coefficients <- coef(model_wls)
# Remove the intercept from the coefficients
coefficients_no_intercept <- coefficients[-1] # Exclude the intercept
# Calculate the absolute values of the coefficients (standardized)
abs_coefficients <- abs(coefficients_no_intercept)
# Calculate relative importance as a percentage of the total importance
relative_importance <- abs_coefficients / sum(abs_coefficients) * 100
# Create a summary table
importance_table <- data.frame(
Predictor = names(abs_coefficients),
Coefficient = coefficients_no_intercept,
Relative_Importance = relative_importance
)
# Print the table with relative importance
print(importance_table)
## Predictor Coefficient
## Age Age -6.361160e-01
## GenderMale GenderMale 1.810597e+01
## Max_BPM Max_BPM 3.888352e-05
## Avg_BPM Avg_BPM 1.184123e+00
## Resting_BPM Resting_BPM 4.535562e-02
## Session_Duration..hours. Session_Duration..hours. 1.436741e+02
## Workout_TypeHIIT Workout_TypeHIIT -1.571721e-01
## Workout_TypeStrength Workout_TypeStrength -7.827747e-01
## Workout_TypeYoga Workout_TypeYoga -1.299503e+00
## Fat_Percentage Fat_Percentage 2.012208e-01
## Water_Intake..liters. Water_Intake..liters. -9.321678e-01
## Workout_Frequency..days.week. Workout_Frequency..days.week. 3.646928e-01
## Experience_Level Experience_Level -4.070368e-01
## BMI BMI 5.331394e-02
## Relative_Importance
## Age 3.789933e-01
## GenderMale 1.078740e+01
## Max_BPM 2.316652e-05
## Avg_BPM 7.054918e-01
## Resting_BPM 2.702255e-02
## Session_Duration..hours. 8.560000e+01
## Workout_TypeHIIT 9.364197e-02
## Workout_TypeStrength 4.663715e-01
## Workout_TypeYoga 7.742345e-01
## Fat_Percentage 1.198859e-01
## Water_Intake..liters. 5.553788e-01
## Workout_Frequency..days.week. 2.172813e-01
## Experience_Level 2.425096e-01
## BMI 3.176406e-02
We can see from the table that Session_Duration is the most important predictor, followed by Gender.
Next, we conducted histogram smoothing to reduce noise and create a clearer representation of the data’s distribution.
# Set up the plotting area to display histograms for continuous predictors
par(mfrow = c(1, 1))
par(cex.axis = 0.8, cex.lab = 0.8, cex.main = 0.9)
# Age - Create Histogram and Overlay Density
hist(gym_clean$Age, breaks = 20, probability = TRUE, col = 0,
ylim = c(0, 0.05), xlab = "Age", ylab = "Density", main = "Histogram with Smoothing for Age")
density_est_age <- density(gym_clean$Age)
lines(density_est_age, col = "blue")
# Max_BPM - Create Histogram and Overlay Density
hist(gym_clean$Max_BPM, breaks = 20, probability = TRUE, col = 0,
ylim = c(0, 0.05), xlab = "Max_BPM", ylab = "Density", main = "Histogram with Smoothing for Max_BPM")
density_est_max_bpm <- density(gym_clean$Max_BPM)
lines(density_est_max_bpm, col = "blue")
# Avg_BPM - Create Histogram and Overlay Density
hist(gym_clean$Avg_BPM, breaks = 20, probability = TRUE, col = 0,
ylim = c(0, 0.05), xlab = "Avg_BPM", ylab = "Density", main = "Histogram with Smoothing for Avg_BPM")
density_est_avg_bpm <- density(gym_clean$Avg_BPM)
lines(density_est_avg_bpm, col = "blue")
# Resting_BPM - Create Histogram and Overlay Density
hist(gym_clean$Resting_BPM, breaks = 20, probability = TRUE, col = 0,
ylim = c(0, 0.2), xlab = "Resting_BPM", ylab = "Density", main = "Histogram with Smoothing for Resting_BPM")
density_est_resting_bpm <- density(gym_clean$Resting_BPM)
lines(density_est_resting_bpm, col = "blue")
# Session_Duration..hours. - Create Histogram and Overlay Density
hist(gym_clean$Session_Duration..hours., breaks = 20, probability = TRUE, col = 0,
ylim = c(0, 1.5), xlab = "Session Duration (hours)", ylab = "Density", main = "Histogram with Smoothing for Session Duration")
density_est_duration <- density(gym_clean$Session_Duration..hours.)
lines(density_est_duration, col = "blue")
# Fat_Percentage - Create Histogram and Overlay Density
hist(gym_clean$Fat_Percentage, breaks = 20, probability = TRUE, col = 0,
ylim = c(0, 0.1), xlab = "Fat_Percentage", ylab = "Density", main = "Histogram with Smoothing for Fat_Percentage")
density_est_fat_percentage <- density(gym_clean$Fat_Percentage)
lines(density_est_fat_percentage, col = "blue")
# Water_Intake..liters. - Create Histogram and Overlay Density
hist(gym_clean$Water_Intake..liters., breaks = 20, probability = TRUE, col = 0,
ylim = c(0, 1.5), xlab = "Water Intake (liters)", ylab = "Density", main = "Histogram with Smoothing for Water Intake")
density_est_water_intake <- density(gym_clean$Water_Intake..liters.)
lines(density_est_water_intake, col = "blue")
# Workout_Frequency..days.week. - Create Histogram and Overlay Density
hist(gym_clean$Workout_Frequency..days.week., breaks = 10, probability = TRUE, col = 0,
ylim = c(0, 1.5), xlab = "Workout Frequency (days/week)", ylab = "Density", main = "Histogram with Smoothing for Workout Frequency")
density_est_workout_frequency <- density(gym_clean$Workout_Frequency..days.week.)
lines(density_est_workout_frequency, col = "blue")
# Experience_Level - Create Histogram and Overlay Density
hist(gym_clean$Experience_Level, breaks = 5, probability = TRUE, col = 0,
ylim = c(0, 1.5), xlab = "Experience Level", ylab = "Density", main = "Histogram with Smoothing for Experience Level")
density_est_experience_level <- density(gym_clean$Experience_Level)
lines(density_est_experience_level, col = "blue")
# BMI - Create Histogram and Overlay Density
hist(gym_clean$BMI, breaks = 20, probability = TRUE, col = 0,
ylim = c(0, 0.1), xlab = "BMI", ylab = "Density", main = "Histogram with Smoothing for BMI")
density_est_bmi <- density(gym_clean$BMI)
lines(density_est_bmi, col = "blue")
# Extract residuals from the model
residuals_model <- residuals(model_wls)
# Plot the kernel density estimate of the residuals using different kernels
par(mfrow = c(1, 1)) # To plot in a single panel (or adjust as necessary)
plot(density(residuals_model, bw = 1, kernel = "rectangular"),
main = "Kernel Density Smoothing of Residuals",
xlab = "Residuals",
ylab = "Density",
col = "black", lwd = 2, ylim = c(0, 0.1))
lines(density(residuals_model, bw = 1, kernel = "triangular"), col = "red", lwd = 2)
lines(density(residuals_model, bw = 1, kernel = "epanechnikov"), col = "green", lwd = 2)
lines(density(residuals_model, bw = 1, kernel = "biweight"), col = "blue", lwd = 2)
lines(density(residuals_model, bw = 1, kernel = "gaussian"), col = "orange", lwd = 2)
# Add a legend
legend("topright", legend = c("rectangular", "triangular", "epanechnikov", "biweight", "gaussian"),
col = c("black", "red", "green", "blue", "orange"),
lty = 1, cex = 0.7)
# Define the log-likelihood function for a mixture of two normal distributions
logL <- function(param, x) {
# param: p (mixing weight), mu1, sd1, mu2, sd2 (mean and standard deviation for both components)
d1 <- dnorm(x, mean = param[2], sd = param[3]) # first normal distribution
d2 <- dnorm(x, mean = param[4], sd = param[5]) # second normal distribution
# The log-likelihood for the mixture model
-sum(log(param[1] * d1 + (1 - param[1]) * d2))
}
# Extract residuals from the transformed model
residuals_model <- residuals(model_wls)
# Set initial parameter guesses (p, mu1, sd1, mu2, sd2)
start_param <- c(p = 0.5, mu1 = mean(residuals_model), sd1 = sd(residuals_model),
mu2 = mean(residuals_model) + 1, sd2 = sd(residuals_model))
# Optimize using the optim() function
opt_result <- optim(start_param, logL, x = residuals_model,
method = "L-BFGS-B",
lower = c(0.01, rep(1, 4)), upper = c(0.99, rep(200, 4)))
# Print the estimated parameters
opt_result$par
## p mu1 sd1 mu2 sd2
## 0.3338319 1.0000000 9.3549730 1.0000000 5.2287271
# The estimated values will be the mixing proportion (p), means (mu1, mu2), and standard deviations (sd1, sd2)
Based on the results of the mixture model, it appears that the data is best described by a mixture of two normal distributions that are nearly identical. Both distributions share the same mean (mu1 = mu2 = 1.0), and while the standard deviations (sd1 = 9.35, sd2 = 5.23) differ slightly, the distributions remain very similar in shape. This suggests that the mixture model may not provide much additional insight beyond fitting a single normal distribution to the residuals.
In essence, the two components of the mixture model are so similar that the added complexity of using a mixture may not be justified. It is likely that a single normal distribution, with a high dispersion to account for the variability in the residuals, would suffice to describe the data accurately.
Therefore, the mixture model’s results suggest minimal value in separating the residuals into two distinct components, and a simpler model may be more appropriate.
# Plot the scatter plot of the predictor against the response (Calories_Burned)
plot(gym_clean$Session_Duration..hours., gym_clean$Calories_Burned_transform,
xlab = "Session_Duration..hours.", ylab = "Calories Burned_transform",
main = "Nadaraya-Watson Regression using Kernel Smoothing")
# Kernel smoothing with box kernel
lines(ksmooth(gym_clean$Session_Duration..hours., gym_clean$Calories_Burned_transform, kernel = "box", bandwidth = 0.25), col = "green", lwd = 1)
lines(ksmooth(gym_clean$Session_Duration..hours., gym_clean$Calories_Burned_transform, kernel = "box", bandwidth = 0.5), col = "green", lwd = 2)
lines(ksmooth(gym_clean$Session_Duration..hours., gym_clean$Calories_Burned_transform, kernel = "box", bandwidth = 0.75), col = "green", lwd = 3)
# Kernel smoothing with normal kernel
lines(ksmooth(gym_clean$Session_Duration..hours., gym_clean$Calories_Burned_transform, kernel = "normal", bandwidth = 0.25), col = "red", lwd = 1)
lines(ksmooth(gym_clean$Session_Duration..hours., gym_clean$Calories_Burned_transform, kernel = "normal", bandwidth = 0.5), col = "red", lwd = 2)
lines(ksmooth(gym_clean$Session_Duration..hours., gym_clean$Calories_Burned_transform, kernel = "normal", bandwidth = 0.75), col = "red", lwd = 3)
# Add legend for box kernel at the top-left
legend("topleft",
legend = c("Box Kernel, Bandwidth = 0.25",
"Box Kernel, Bandwidth = 0.50",
"Box Kernel, Bandwidth = 0.75"),
col = "green",
lwd = c(1, 2, 3))
# Add legend for normal kernel at the bottom-right
legend("bottomright",
legend = c("Normal Kernel, Bandwidth = 0.25",
"Normal Kernel, Bandwidth = 0.50",
"Normal Kernel, Bandwidth = 0.75"),
col = "red",
lwd = c(1, 2, 3))
The plot shows a positive relationship between Calories_Burned_transform and Session_Duration.hours., analyzed using Nadaraya-Watson regression with Box and Normal kernels across three bandwidths (h = 0.25, 0.50, 0.75).
Smaller bandwidths (h = 0.25) capture local fluctuations but appear noisy.
Larger bandwidths (h=0.75) oversmooth the data, losing some local variation.
Medium bandwidth (h=0.50) provides a good balance, showing a smooth, reliable trend.
Both kernels produce similar results, with the Normal kernel offering slightly smoother curves. The positive trend confirms that longer session durations are strongly associated with higher calories burned. A medium bandwidth is likely the most practical choice for interpretation.
# Plotting Calories_Burned vs Session_Duration..hours. with multiple LOESS spans
plot(gym_clean$Session_Duration..hours., gym_clean$Calories_Burned_transform,
xlab = "Session_Duration..hours.", ylab = "Calories_Burned_transform",
main = "Calories Burned vs Session_Duration with Different LOESS Spans")
# LOESS smoothing with span = 0.25
loess_fit_025 <- loess(gym_clean$Calories_Burned_transform ~ gym_clean$Session_Duration..hours., data = gym_clean, span = 0.25)
lines(loess_fit_025$x, fitted(loess_fit_025), col = "red", lwd = 2)
# LOESS smoothing with span = 0.5
loess_fit_050 <- loess(gym_clean$Calories_Burned_transform ~ gym_clean$Session_Duration..hours., data = gym_clean, span = 0.5)
lines(loess_fit_050$x, fitted(loess_fit_050), col = "blue", lwd = 2)
# LOESS smoothing with span = 0.75
loess_fit_075 <- loess(gym_clean$Calories_Burned_transform ~ gym_clean$Session_Duration..hours., data = gym_clean, span = 0.75)
lines(loess_fit_075$x, fitted(loess_fit_075), col = "green", lwd = 2)
# Add a legend to the plot
legend("topright", legend = c("span = 0.25", "span = 0.5", "span = 0.75"),
col = c("red", "blue", "green"), lwd = 2)
The LOESS regression shows a positive relationship between Session_Duration.hours. and Calories_Burned_transform, with different spans (0.25,0.5,0.75) affecting the smoothness of the curve.
A lower span (0.25) captures more local variability, while a higher span (0.75) smooths the global trend. The medium span (0.5) provides the best balance between detail and smoothness, confirming a strong, consistent positive trend in the data.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
gym5<- gym_clean[, c("Gender",
"Weight..kg.",
"Height..m.",
"Workout_Type",
"Session_Duration..hours.",
"Calories_Burned_transform",
"Fat_Percentage",
"Water_Intake..liters.",
"Workout_Frequency..days.week.",
"Experience_Level",
"BMI",
"Age",
"Max_BPM",
"Avg_BPM",
"Resting_BPM")]
gym5<-rename(gym5,Calories_Burned=Calories_Burned_transform)
gym2<-cbind(gym5,
week_time=gym5$Session_Duration..hours.*gym5$Workout_Frequency..days.week.)
Here we focus on the relationship between BMI and Workout_Type as well as Gender. First, we list all possible combinations of Workout_Type and Gender and calculate their corresponding mean BMI and standard deviation to understand the distribution of the data. We then use the Analysis of Variance (ANOVA) model to assess the relationship between BMI and Workout_Type, Gender, and their interactions.
attach(gym2)
table(Workout_Type,Gender)
## Gender
## Workout_Type Female Male
## Cardio 126 129
## HIIT 107 114
## Strength 123 133
## Yoga 106 133
aggregate(BMI,by=list(Workout_Type,Gender),FUN=mean)
## Group.1 Group.2 x
## 1 Cardio Female 23.02548
## 2 HIIT Female 22.80776
## 3 Strength Female 22.94309
## 4 Yoga Female 22.04358
## 5 Cardio Male 27.67519
## 6 HIIT Male 27.44912
## 7 Strength Male 26.01406
## 8 Yoga Male 26.53902
aggregate(BMI,by=list(Workout_Type,Gender),FUN=sd)
## Group.1 Group.2 x
## 1 Cardio Female 4.438491
## 2 HIIT Female 4.277944
## 3 Strength Female 4.598575
## 4 Yoga Female 4.560615
## 5 Cardio Male 7.827989
## 6 HIIT Male 7.275462
## 7 Strength Male 7.827133
## 8 Yoga Male 7.558851
fit=aov(BMI~Workout_Type*Gender)
summary(fit)
## Df Sum Sq Mean Sq F value Pr(>F)
## Workout_Type 3 142 47 1.178 0.317
## Gender 1 4252 4252 106.015 <2e-16 ***
## Workout_Type:Gender 3 110 37 0.916 0.432
## Residuals 963 38620 40
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
interaction.plot(Workout_Type,Gender,BMI,
type="b", col=c("red","blue"),pch=c(16,18),
main="BMI between Workout_Type and Gender")
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
plotmeans(BMI~interaction(Workout_Type,Gender, sep=" "), connect=list(c(1,3,5,7),c(2,4,6,8)),col=c("red","darkgreen"),main="Interaction Plot with 95% CIs", xlab="Treatment and Dose Combination")
library(HH)
## Loading required package: lattice
## Loading required package: grid
## Loading required package: latticeExtra
##
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
##
## layer
## Loading required package: multcomp
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
##
## geyser
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
##
## Attaching package: 'HH'
## The following object is masked from 'package:gplots':
##
## residplot
## The following objects are masked from 'package:car':
##
## logit, vif
## The following object is masked from 'package:base':
##
## is.R
interaction2wt(BMI~Workout_Type*Gender)
The first plot (BMI between Workout_Type and Gender line graph) highlights significant differences in BMI trends across Workout Type and Gender, with males showing greater sensitivity to workout type than females.
The second plot (Interaction Plot with 95% CIs) suggests that treatment type affects BMI differently for males and females, particularly with males showing a greater BMI increase under cardio training.
The third plot (BMI: main effects and 2-way interactions) show significant main effects of Gender and Workout Type on BMI, but no significant interaction effect between these factors. This indicates that while both gender and workout type independently influence BMI, the relationship between workout type and BMI does not differ significantly between males and females.
Gender: Males have significantly higher BMI values compared to females.
Workout Type: Different workout types (e.g., Cardio, HIIT, Strength, Yoga) are associated with variations in BMI, with Cardio linked to the highest BMI and Yoga to the lowest.
Non-Significant Interaction Effect: The interaction between Gender and Workout Type is not statistically significant, indicating that the relationship between Workout Type and BMI does not vary substantially by gender.
The four workout types have little impact on BMI. At the same time, males have a higher BMI than females. The trends in exercise forms are similar between different genders, while the influence of gender is relatively small in strength training.
gym2$Workout_Type <- as.factor(gym2$Workout_Type)
ancova(BMI~Calories_Burned+Workout_Type, data=gym2)
## Analysis of Variance Table
##
## Response: BMI
## Df Sum Sq Mean Sq F value Pr(>F)
## Calories_Burned 1 152 151.804 3.4241 0.06456 .
## Workout_Type 3 145 48.349 1.0906 0.35214
## Residuals 966 42827 44.334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ancova(BMI~Calories_Burned*Workout_Type, data=gym2)
## Analysis of Variance Table
##
## Response: BMI
## Df Sum Sq Mean Sq F value Pr(>F)
## Calories_Burned 1 152 151.804 3.4172 0.06483 .
## Workout_Type 3 145 48.349 1.0884 0.35307
## Calories_Burned:Workout_Type 3 47 15.503 0.3490 0.78990
## Residuals 963 42780 44.424
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the first plot:
The effect of Calories_Burned on BMI is not uniform across Workout Types.
Cardio and Strength exhibit positive relationships, Yoga shows a negative relationship, and HIIT shows no relationship.
These highlight a significant interaction between Calories_Burned and Workout Type in influencing BMI, suggesting that the impact of calorie expenditure varies depending on the type of exercise.
From the second plot:
Strength and Cardio show slight positive trends.
Yoga exhibits a slight negative trend.
HIIT shows no strong relationship.
These suggest that the effect of Calories_Burned on BMI is not uniform and depends on the type of workout.
Overall, we have studied the effects of week_time and Workout_Type on BMI, both with and without considering their interaction. From the chart, Workout_Type appears to influence BMI, with variations across workout types (e.g., slight positive trends for Cardio and Strength, and a slight negative trend for Yoga). The interaction between week_time and Workout_Type is minimal but warrants further statistical testing to confirm whether it is significant.
gym2$Gender <- as.factor(gym2$Gender)
ancova(BMI~Calories_Burned+Gender, data=gym2)
## Analysis of Variance Table
##
## Response: BMI
## Df Sum Sq Mean Sq F value Pr(>F)
## Calories_Burned 1 152 151.8 3.7765 0.05227 .
## Gender 1 4060 4060.4 101.0121 < 2e-16 ***
## Residuals 968 38911 40.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ancova(BMI~Calories_Burned*Gender, data=gym2)
## Analysis of Variance Table
##
## Response: BMI
## Df Sum Sq Mean Sq F value Pr(>F)
## Calories_Burned 1 152 151.8 3.7726 0.05239 .
## Gender 1 4060 4060.4 100.9093 < 2e-16 ***
## Calories_Burned:Gender 1 1 0.6 0.0141 0.90553
## Residuals 967 38911 40.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Males have higher and more variable BMI values compared to females.
Females display a tighter cluster of BMI values with less variability.
The trend lines for both genders are nearly flat, suggesting no significant relationship between Calories_Burned and BMI.
In the panel, the combined view confirms that gender differences exist in BMI levels but highlights the lack of a relationship between Calories_Burned and BMI for either gender.
For both males and females, the trend lines are nearly parallel, suggesting no significant interaction between Calories_Burned and Gender in predicting BMI.
The distribution of BMI for males remains higher and more variable than that of females.
Neither gender shows a meaningful relationship between Calories_Burned and BMI.
The overlapping parallel lines reinforce the conclusion that there is no significant interaction effect.
The results from both plots suggest that while Gender has a clear main effect on BMI, with males exhibiting higher and more variable BMI values than females, there is no significant relationship between Calories_Burned and BMI for either gender.
Furthermore, the interaction between Calories_Burned and Gender is negligible, as indicated by the parallel trend lines in the superposed views.
These findings imply that BMI differences are primarily driven by gender rather than variations in Calories_Burned or interactions between the two variables.
library(coin)
gym2<-transform(gym2,Gender= factor(Gender))
t.test(week_time~Gender,data=gym2,var.equal =TRUE )
##
## Two Sample t-test
##
## data: week_time by Gender
## t = 0.53941, df = 969, p-value = 0.5897
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -0.1984698 0.3489347
## sample estimates:
## mean in group Female mean in group Male
## 4.405390 4.330157
t.test(Calories_Burned~Gender,data=gym2,var.equal =TRUE )
##
## Two Sample t-test
##
## data: Calories_Burned by Gender
## t = -4.5006, df = 969, p-value = 7.598e-06
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -21.805868 -8.563712
## sample estimates:
## mean in group Female mean in group Male
## 220.9197 236.1045
t.test(Experience_Level~Gender,data=gym2,var.equal =TRUE )
##
## Two Sample t-test
##
## data: Experience_Level by Gender
## t = -0.0022392, df = 969, p-value = 0.9982
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -0.09327738 0.09306476
## sample estimates:
## mean in group Female mean in group Male
## 1.807359 1.807466
t.test(BMI~Gender,data=gym2,var.equal =TRUE )
##
## Two Sample t-test
##
## data: BMI by Gender
## t = -10.229, df = 969, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -4.964987 -3.366611
## sample estimates:
## mean in group Female mean in group Male
## 22.72784 26.89363
The two-sample t-test shows a significant difference in BMI between males and females (t = -10.229, p-value < 2.2e-16). The mean BMI for the female group (22.73) is significantly lower than that of the male group (26.89), as confirmed by the 95% confidence interval (-4.96, -3.37), which does not include 0. This indicates a statistically significant difference in BMI based on gender. However, this test does not address differences related to week_time, Experience_Level, or Calories_Burned.
### PCA
GYM<-subset(gym2,select = c(Avg_BPM,Session_Duration..hours., Calories_Burned,Fat_Percentage,Water_Intake..liters.,Workout_Frequency..days.week.,BMI))
GYM$Avg_BPM<- max(GYM$Avg_BPM) -GYM$Avg_BPM
GYM$Session_Duration..hours. <- max(GYM$Session_Duration..hours. ) -GYM$Session_Duration..hours.
GYM$Calories_Burned<- max(GYM$Calories_Burned) -GYM$Calories_Burned
GYM$Fat_Percentage<- max(GYM$Fat_Percentage) -GYM$Fat_Percentage
GYM$Water_Intake..liters.<- max(GYM$Water_Intake..liters.) -GYM$Water_Intake..liters.
GYM$Workout_Frequency..days.week.<- max(GYM$Workout_Frequency..days.week.) -GYM$Workout_Frequency..days.week.
BMI<- which(colnames(GYM) == "BMI")
### Scatterplot matrix for the heptathlon data
plot(GYM[, -BMI])
round(cor(GYM[, -BMI]), 2)
## Avg_BPM Session_Duration..hours. Calories_Burned
## Avg_BPM 1.00 0.01 0.33
## Session_Duration..hours. 0.01 1.00 0.91
## Calories_Burned 0.33 0.91 1.00
## Fat_Percentage 0.00 -0.58 -0.58
## Water_Intake..liters. -0.01 0.28 0.34
## Workout_Frequency..days.week. -0.01 0.64 0.57
## Fat_Percentage Water_Intake..liters.
## Avg_BPM 0.00 -0.01
## Session_Duration..hours. -0.58 0.28
## Calories_Burned -0.58 0.34
## Fat_Percentage 1.00 -0.59
## Water_Intake..liters. -0.59 1.00
## Workout_Frequency..days.week. -0.53 0.24
## Workout_Frequency..days.week.
## Avg_BPM -0.01
## Session_Duration..hours. 0.64
## Calories_Burned 0.57
## Fat_Percentage -0.53
## Water_Intake..liters. 0.24
## Workout_Frequency..days.week. 1.00
GYM_pca=prcomp(GYM[, -BMI],scale=TRUE)
print(GYM_pca)
## Standard deviations (1, .., p=6):
## [1] 1.7811707 1.0602010 0.9541980 0.6736985 0.5579315 0.1665942
##
## Rotation (n x k) = (6 x 6):
## PC1 PC2 PC3 PC4
## Avg_BPM 0.07584836 0.85038453 0.38970768 -0.25247910
## Session_Duration..hours. 0.50029629 0.05754146 -0.33535792 0.43620351
## Calories_Burned 0.50486770 0.30565120 -0.09529942 0.37127523
## Fat_Percentage -0.45674445 0.23928985 -0.25894401 0.18919531
## Water_Intake..liters. 0.31329249 -0.34297189 0.72253567 0.06880337
## Workout_Frequency..days.week. 0.42695617 -0.07227123 -0.37077110 -0.75339680
## PC5 PC6
## Avg_BPM -0.02330343 0.234376535
## Session_Duration..hours. 0.06298377 0.663089144
## Calories_Burned 0.07244745 -0.706760241
## Fat_Percentage 0.79449886 -0.007040758
## Water_Intake..liters. 0.50309237 0.065760635
## Workout_Frequency..days.week. 0.32543969 -0.038682751
summary(GYM_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 1.7812 1.0602 0.9542 0.67370 0.55793 0.16659
## Proportion of Variance 0.5288 0.1873 0.1517 0.07564 0.05188 0.00463
## Cumulative Proportion 0.5288 0.7161 0.8679 0.94349 0.99537 1.00000
From the first plot: Key driver of calories burned appears to be session duration, while other variables show weak or no correlation.
#权重
a1=GYM_pca$rotation[,1]
a1
## Avg_BPM Session_Duration..hours.
## 0.07584836 0.50029629
## Calories_Burned Fat_Percentage
## 0.50486770 -0.45674445
## Water_Intake..liters. Workout_Frequency..days.week.
## 0.31329249 0.42695617
plot(GYM_pca)
## A plot of the data in the space of the first two principal components,
## with the points labelled by the name of the corresponding competitor;
## the first two loadings for the events are given in a second coordinate system.
biplot(GYM_pca,col=c("blue","red"))
#绘制BMI与第一主成分得分的关系图
cor(GYM$BMI,GYM_pca$x[,1])
## [1] -0.08453751
plot(GYM$BMI, GYM_pca$x[,1])
PC1 is driven by workout-related variables (Session Duration, Calories Burned, and Workout Frequency). PC2 primarily reflects Fat Percentage, which is independent of the workout variables. The biplot highlights a strong correlation among workout-related variables, while Fat Percentage and Water Intake contribute unique dimensions to the data structure.
The scatterplot suggests that BMI does not have a strong or direct relationship with the first principal component (PC1). The majority of individuals fall within the BMI range of 20–30, with PC1 showing variability across this range. This implies that the variables driving PC1 (such as workout-related metrics) may not be significant predictors of BMI.
#### Hierarchical Clustering
pottery_dist=dist(subset(gym2,select = c(BMI,Workout_Frequency..days.week., Calories_Burned,Session_Duration..hours.)))
levelplot(as.matrix(pottery_dist),xlab="Pot Number",ylab="Pot Number")
pottery_single=hclust(pottery_dist,method="single")
pottery_complete=hclust(pottery_dist,method="complete")
pottery_average=hclust(pottery_dist,method="average")
layout(matrix(1:3,ncol=3))
plot(pottery_single,main="Single Linkage",sub="",xlab="")
plot(pottery_complete,main="Complete Linkage",sub="",xlab="")
plot(pottery_average,main="Average Linkage",sub="",xlab="")
The heatmap reveals patterns of similarity among the pot numbers based on their features. The gradient primarily shows lighter values, suggesting limited high similarity across the dataset. However, some blocks of darker shades indicate groups or clusters of similar observations.
The clusters are moderately compact and balanced, offering a compromise between the extremes of single and complete linkage.
#### k-means clustering
library("scatterplot3d")
par(no.readonly = TRUE)
## $xlog
## [1] FALSE
##
## $ylog
## [1] FALSE
##
## $adj
## [1] 0.5
##
## $ann
## [1] TRUE
##
## $ask
## [1] FALSE
##
## $bg
## [1] "white"
##
## $bty
## [1] "o"
##
## $cex
## [1] 1
##
## $cex.axis
## [1] 1
##
## $cex.lab
## [1] 1
##
## $cex.main
## [1] 1.2
##
## $cex.sub
## [1] 1
##
## $col
## [1] "black"
##
## $col.axis
## [1] "black"
##
## $col.lab
## [1] "black"
##
## $col.main
## [1] "black"
##
## $col.sub
## [1] "black"
##
## $crt
## [1] 0
##
## $err
## [1] 0
##
## $family
## [1] ""
##
## $fg
## [1] "black"
##
## $fig
## [1] 0 1 0 1
##
## $fin
## [1] 6.999999 4.999999
##
## $font
## [1] 1
##
## $font.axis
## [1] 1
##
## $font.lab
## [1] 1
##
## $font.main
## [1] 2
##
## $font.sub
## [1] 1
##
## $lab
## [1] 5 5 7
##
## $las
## [1] 0
##
## $lend
## [1] "round"
##
## $lheight
## [1] 1
##
## $ljoin
## [1] "round"
##
## $lmitre
## [1] 10
##
## $lty
## [1] "solid"
##
## $lwd
## [1] 1
##
## $mai
## [1] 1.02 0.82 0.82 0.42
##
## $mar
## [1] 5.1 4.1 4.1 2.1
##
## $mex
## [1] 1
##
## $mfcol
## [1] 1 1
##
## $mfg
## [1] 1 1 1 1
##
## $mfrow
## [1] 1 1
##
## $mgp
## [1] 3 1 0
##
## $mkh
## [1] 0.001
##
## $new
## [1] FALSE
##
## $oma
## [1] 0 0 0 0
##
## $omd
## [1] 0 1 0 1
##
## $omi
## [1] 0 0 0 0
##
## $pch
## [1] 1
##
## $pin
## [1] 5.759999 3.159999
##
## $plt
## [1] 0.1171429 0.9400000 0.2040000 0.8360000
##
## $ps
## [1] 12
##
## $pty
## [1] "m"
##
## $smo
## [1] 1
##
## $srt
## [1] 0
##
## $tck
## [1] NA
##
## $tcl
## [1] -0.5
##
## $usr
## [1] 0 1 0 1
##
## $xaxp
## [1] 0 1 5
##
## $xaxs
## [1] "r"
##
## $xaxt
## [1] "s"
##
## $xpd
## [1] FALSE
##
## $yaxp
## [1] 0 1 5
##
## $yaxs
## [1] "r"
##
## $yaxt
## [1] "s"
##
## $ylbias
## [1] 0.2
layout(matrix(1))
gym1<-subset(gym2,select = c(BMI,Workout_Frequency..days.week., Calories_Burned,Session_Duration..hours.))
scatterplot3d(gym1$BMI,gym1$Workout_Frequency..days.week., gym1$Calories_Burned,gym1$Session_Duration..hours.,type = "h", angle = 55, scale.y = 0.7, pch = 16, y.ticklabs = seq(0,10, by = 2), y.margin.add = 0.1)
rge <- apply(gym1, 2, max) - apply(gym1, 2, min)
gym1.dat <- sweep(gym1, 2, rge, FUN = "/") ### function = divide
n <- nrow(gym1.dat)
wss <- rep(0, 10)
wss[1] <- (n - 1) * sum(apply(gym1.dat, 2, var))
for (i in 2:10) wss[i] <- sum(kmeans(gym1.dat,centers = i)$withinss)
plot(1:10, wss, type = "b", xlab = "Number of groups", ylab = "Within groups sum of squares")
ccent=function(cl){
f=function(i) colMeans(gym1[cl==i,])
x=sapply(sort(unique(cl)),f)
colnames(x)=sort(unique(cl))
return(x)
}
gym1_kmeans2= kmeans(gym1.dat,centers=2)
table(gym1_kmeans2$cluster)
##
## 1 2
## 564 407
ccent(gym1_kmeans2$cluster)
## 1 2
## BMI 24.795904 25.071818
## Workout_Frequency..days.week. 2.650709 4.245700
## Calories_Burned 206.126112 260.410143
## Session_Duration..hours. 1.090337 1.483194
gym1_kmeans8= kmeans(gym1.dat,centers=8)
table(gym1_kmeans8$cluster)
##
## 1 2 3 4 5 6 7 8
## 94 185 101 102 93 80 223 93
ccent(gym1_kmeans8$cluster)
## 1 2 3 4
## BMI 24.38074 23.29989 24.528317 25.3676471
## Workout_Frequency..days.week. 2.00000 4.00000 5.000000 2.0000000
## Calories_Burned 235.78271 224.36397 299.379510 154.5194670
## Session_Duration..hours. 1.27266 1.24173 1.770792 0.7541176
## 5 6 7 8
## BMI 22.3225806 38.177125 22.675740 25.108925
## Workout_Frequency..days.week. 3.0000000 3.325000 3.000000 4.000000
## Calories_Burned 158.4321634 233.807385 231.575397 295.620811
## Session_Duration..hours. 0.8004301 1.217875 1.262108 1.722258
Clusters 1 and 4 represent less active groups with lower workout frequency and shorter sessions, with Cluster 4 showing a higher BMI.
Clusters 3 and 8 are the most active with high workout frequency, calorie burn, and longer session durations.
Cluster 6 stands out due to its extremely high BMI, indicating a potential focus area for tailored fitness interventions.
Clusters 5 and 7 are similar in BMI and calorie burn but differ slightly in session duration and frequency.
#### Model-based clustering
library("mclust")
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
##
## Attaching package: 'mclust'
## The following object is masked from 'package:mvtnorm':
##
## dmvnorm
gym1_mclust=Mclust(gym1.dat)
print(gym1_mclust)
## 'Mclust' model object: (VVE,4)
##
## Available components:
## [1] "call" "data" "modelName" "n"
## [5] "d" "G" "BIC" "loglik"
## [9] "df" "bic" "icl" "hypvol"
## [13] "parameters" "z" "classification" "uncertainty"
table(gym1_mclust$classification)
##
## 1 2 3 4
## 186 540 175 70
ccent(gym1_mclust$classification)
## 1 2 3 4
## BMI 24.745645 23.216463 24.8258857 38.643000
## Workout_Frequency..days.week. 4.543011 3.201852 2.4171429 3.228571
## Calories_Burned 298.030866 227.328797 153.1690064 246.374107
## Session_Duration..hours. 1.760215 1.249333 0.7452571 1.230714
plot.Mclust(gym1_mclust, what = "BIC",
ylim = range(gym1_mclust$BIC[,-(1:2)], na.rm = TRUE),
legendArgs = list(x = "bottomleft", cex =0.7))
clPairs(gym1.dat,classification = gym1_mclust$classification,symbols=1:4,col="black")
scatterplot3d(gym1$BMI, gym1$Workout_Frequency..days.week.,
gym1$Calories_Burned,gym1$Session_Duration..hours.,
type = "h", angle = 55,
scale.y = 0.7, pch = gym1_mclust$classification,
y.ticklabs = seq(0, 10, by = 2), y.margin.add = 0.1)
n<-nrow(gym2)
gym2$Workout_Type <- as.factor(gym2$Workout_Type)
gym2$Gender <- as.factor(gym2$Gender)
gym_up<-gym2[1:((n+1)/2),]
gym_down<-gym2[((n+1)/2+1):n,]
fit.logit <- glm(Gender~Age+Avg_BPM+week_time+Experience_Level+BMI+Weight..kg.+Height..m.+Calories_Burned, data=gym_up,family = binomial())
summary(fit.logit)
##
## Call:
## glm(formula = Gender ~ Age + Avg_BPM + week_time + Experience_Level +
## BMI + Weight..kg. + Height..m. + Calories_Burned, family = binomial(),
## data = gym_up)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -65.498924 16.074025 -4.075 4.60e-05 ***
## Age 0.040484 0.013112 3.088 0.002018 **
## Avg_BPM -0.049582 0.014144 -3.506 0.000456 ***
## week_time -0.976219 0.213606 -4.570 4.87e-06 ***
## Experience_Level 0.650589 0.426130 1.527 0.126826
## BMI 0.755162 0.313158 2.411 0.015890 *
## Weight..kg. -0.168318 0.106017 -1.588 0.112366
## Height..m. 35.153487 9.236568 3.806 0.000141 ***
## Calories_Burned 0.033084 0.006542 5.058 4.25e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 672.55 on 485 degrees of freedom
## Residual deviance: 308.65 on 477 degrees of freedom
## AIC: 326.65
##
## Number of Fisher Scoring iterations: 6
logit.fit.reduced=step(fit.logit)
## Start: AIC=326.65
## Gender ~ Age + Avg_BPM + week_time + Experience_Level + BMI +
## Weight..kg. + Height..m. + Calories_Burned
##
## Df Deviance AIC
## <none> 308.65 326.65
## - Experience_Level 1 311.01 327.01
## - Weight..kg. 1 311.22 327.22
## - BMI 1 314.84 330.84
## - Age 1 318.74 334.74
## - Avg_BPM 1 321.80 337.80
## - Height..m. 1 326.64 342.64
## - week_time 1 332.38 348.38
## - Calories_Burned 1 337.61 353.61
prob <- predict(logit.fit.reduced, gym_down, type="response")
logit.pred <- factor(prob > .5, levels=c(FALSE, TRUE),
labels=c("benign", "malignant"))
logit.perf <- table(gym_down$BMI, logit.pred,
dnn=c("Actual", "Predicted"))
Model-based clustering indicates that the dataset is best described by 2–3 clusters. Calories Burned and Session Duration play key roles in defining these clusters, while BMI and Workout Frequency show weaker differentiation. The BIC plot supports the identification of a model with a small number of components as the optimal clustering solution.
#### classical decision tree
library(rpart)
set.seed(12345)
#### grow the tree
dtree <- rpart(Gender ~ ., data=gym_up, method="class",
parms=list(split="information"))
### The complexity parameter (cp) is used to penalize larger trees.
### Tree size is defined by the number of branch splits (nsplit).
dtree$cptable
## CP nsplit rel error xerror xstd
## 1 0.64935065 0 1.00000000 1.00000000 0.04765911
## 2 0.17748918 1 0.35064935 0.35064935 0.03556640
## 3 0.06926407 2 0.17316017 0.17316017 0.02622813
## 4 0.02020202 3 0.10389610 0.11255411 0.02147511
## 5 0.01000000 6 0.04329004 0.07792208 0.01802308
plotcp(dtree)
#### prune the tree
dtree.pruned=prune(dtree,cp=.0177)
library(rpart.plot)
prp(dtree.pruned, type = 2, extra = 104,
fallen.leaves = TRUE, main="Decision Tree")
#### classifies new cases
dtree.pred <- predict(dtree.pruned, gym_down, type="class")
dtree.perf <- table(gym_down$Gender, dtree.pred,
dnn=c("Actual", "Predicted"))
dtree.perf
## Predicted
## Actual Female Male
## Female 227 4
## Male 11 243
Individuals with Water_Intake < 2.8 and Weight < 81 are predominantly Female. Conversely, Water_Intake ≥ 2.8 almost exclusively classifies individuals as Male.
library(party)
## Loading required package: modeltools
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:car':
##
## Predict
## Loading required package: strucchange
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
##
## Attaching package: 'party'
## The following object is masked from 'package:dplyr':
##
## where
fit.ctree <- ctree(Gender~., data=gym_up)
plot(fit.ctree, main="Conditional Inference Tree")
ctree.pred <- predict(fit.ctree, gym_down, type="response")
ctree.perf <- table(gym_down$Gender, ctree.pred,
dnn=c("Actual", "Predicted"))
ctree.perf
## Predicted
## Actual Female Male
## Female 226 5
## Male 23 231
Primary Split: Water intake (≤2.7 liters) is the strongest determinant, classifying individuals predominantly as Female.
Weight Threshold: For those consuming ≤2.7 liters, weight is a critical factor:
Individuals ≤79.7 kg are more likely Female.
Individuals >79.7 kg show a higher Male classification.
Height Influence: Among those ≤79.7 kg, height further refines classification:
Those ≤1.79 meters are overwhelmingly Female.
Those >1.79 meters are more evenly split but skew slightly Male.
Key Predictors: Water intake, weight, and height are the dominant factors driving classification in this model.
#### Random Forest
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
set.seed(1234)
### grow the forest
fit.forest <- randomForest(Gender~., data=gym_up,
na.action=na.roughfix,
importance=TRUE)
print(head(fit.forest, 2))
## $call
## randomForest(formula = Gender ~ ., data = gym_up, importance = TRUE,
## na.action = na.roughfix)
##
## $type
## [1] "classification"
#### determine variable importance
importance(fit.forest,type=2)
## MeanDecreaseGini
## Weight..kg. 47.097738
## Height..m. 35.484884
## Workout_Type 1.784627
## Session_Duration..hours. 6.565561
## Calories_Burned 5.493744
## Fat_Percentage 33.823725
## Water_Intake..liters. 72.236791
## Workout_Frequency..days.week. 1.312308
## Experience_Level 2.673993
## BMI 16.384282
## Age 2.962519
## Max_BPM 3.246921
## Avg_BPM 3.666733
## Resting_BPM 2.925477
## week_time 6.283355
#### classifies new cases
forest.pred <- predict(fit.forest, gym_down)
forest.perf <- table(gym_down$Gender, forest.pred,
dnn = c("Actual", "Predicted"))
accuracy <- sum(diag(forest.perf)) / sum(forest.perf)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.9731959
Key Variables:
Most Important: Water intake, weight, and height have the highest MeanDecreaseGini scores, indicating their strong predictive power for determining gender.
Moderate Importance: Session duration, calories burned, and BMI also contribute but to a lesser extent.
Least Important: Variables like workout type and workout frequency have minimal predictive influence.
Model Accuracy: The model achieves an accuracy of 97.32%, demonstrating excellent predictive performance.
#### Support vector machines
library(e1071)
set.seed(1234)
fit.svm <- svm(Gender~., data=gym_up)
fit.svm
##
## Call:
## svm(formula = Gender ~ ., data = gym_up)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 170
svm.pred <- predict(fit.svm, na.omit(gym_down))
svm.perf <- table(na.omit(gym_down)$Gender,
svm.pred, dnn=c("Actual", "Predicted"))
accuracy <- sum(diag(svm.perf)) / sum(svm.perf)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.9505155
#### Tuning an RBF support vector machine
set.seed(1234)
### varies the parameters
tuned <- tune.svm(Workout_Type~., data=gym_up,
gamma=10^(-4:2),
cost=10^(-2:5))
tuned ### print the best model
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 1 10
##
## - best performance: 0.7222789
### fit the model with tuned parameters
fit.svm <- svm(Workout_Type~., data=gym_up, gamma=0.1, cost=10)
svm.pred <- predict(fit.svm, na.omit(gym2))
#### evaluate the cross-validation performance
svm.perf <- table(na.omit(gym2)$Workout_Type,
svm.pred, dnn=c("Actual", "Predicted"))
accuracy <- sum(diag(svm.perf)) / sum(svm.perf)
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.6035015
Overall:
Lower accuracy compared to Random Forest in this context.
Performance highly depends on proper parameter tuning.
In conclusion, the two most significant contributors to the number of calories burned is session duration and gender.
We believe that this project is particularly impactful and deserves more insightful research as more and more people are becoming more health-conscious, with the number of gym-goers increasing yearly. Knowing that extending the length of each exercise routine helps to better achieve weight loss goals, as well as produce higher levels of dopamine, may further motivate the human population to exercise and gain the necessary physical health benefits. Such research will definitely be advantageous to mankind, particularly those in the fitness industry, or even the average person.
Moreover, we must also acknowledge the limitations of the study.
One, the limited sample size of the study did not indicate the heritage or genetic backgrounds of the participants, which means that this study cannot represent the effectiveness of session duration on calories burned for every person.
Secondly, the study did not consider factors, such as the intensity of exercises, as stereotypes like Pilates being less intense than running may not always hold true.
Thirdly, the variables used may be subjective, just as the intensity is rated differently by people with different pain tolerances and experiences.
All in all, the study is very useful, providing many insights and possible points of further research.